diff --git a/Kbuild b/Kbuild
index 94f4589..91ad5c4 100644
--- a/Kbuild
+++ b/Kbuild
@@ -13,6 +13,7 @@ neuron-objs += neuron_fw_io.o
 neuron-objs += neuron_dmabuf.o
 neuron-objs += neuron_log.o
 neuron-objs += neuron_power.o
+neuron-objs += neuron_test.o
 neuron-objs += vc/neuron_dhal_vc.o
 neuron-objs += v2/notific.o v2/neuron_dhal_v2.o
 neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o
diff --git a/Kbuild.tpl b/Kbuild.tpl
new file mode 100644
index 0000000..992b47c
--- /dev/null
+++ b/Kbuild.tpl
@@ -0,0 +1,23 @@
+obj-m += neuron.o
+
+neuron-objs := neuron_arch.o neuron_dhal.o
+neuron-objs += neuron_reg_access.o
+neuron-objs += neuron_module.o neuron_pci.o neuron_mempool.o neuron_dma.o neuron_ring.o neuron_ds.o
+neuron-objs += neuron_core.o neuron_crwl.o neuron_cdev.o neuron_topsp.o neuron_pid.o
+neuron-objs += neuron_reset.o neuron_cinit.o neuron_mmap.o neuron_p2p.o
+neuron-objs += neuron_nq.o
+neuron-objs += neuron_mc_handle.o
+neuron-objs += neuron_metrics.o neuron_sysfs_metrics.o
+neuron-objs += udma/udma_iofic.o udma/udma_m2m.o udma/udma_main.o
+neuron-objs += neuron_fw_io.o
+neuron-objs += neuron_dmabuf.o
+neuron-objs += neuron_log.o
+neuron-objs += neuron_power.o
+neuron-objs += vc/neuron_dhal_vc.o
+neuron-objs += v2/notific.o v2/neuron_dhal_v2.o
+neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o
+neuron-objs += v4/neuron_dhal_v4.o
+{extra-objs}
+ccflags-y += -O3 -Wall -Werror -Wno-declaration-after-statement -Wunused-macros -Wunused-local-typedefs
+ccflags-y += -I$(src)/
+ccflags-y += $(call cc-option,-march=armv8.2-a)
diff --git a/aws-neuronx-dkms-mkdeb/debian/postinst b/aws-neuronx-dkms-mkdeb/debian/postinst
index c4ad0da..e189062 100755
--- a/aws-neuronx-dkms-mkdeb/debian/postinst
+++ b/aws-neuronx-dkms-mkdeb/debian/postinst
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/sh
 # Copyright (C) 2002-2005 Flavio Stanchina
 # Copyright (C) 2005-2006 Aric Cyr
 # Copyright (C) 2007 Mario Limonciello
diff --git a/aws-neuronx-dkms-mkdeb/debian/prerm b/aws-neuronx-dkms-mkdeb/debian/prerm
index 654f5fc..61b816d 100755
--- a/aws-neuronx-dkms-mkdeb/debian/prerm
+++ b/aws-neuronx-dkms-mkdeb/debian/prerm
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/sh
 
 NAME=MODULE_NAME
 VERSION=MODULE_VERSION
diff --git a/aws-neuronx-dkms-mkrpm.spec b/aws-neuronx-dkms-mkrpm.spec
index a01839a..c9df3b0 100644
--- a/aws-neuronx-dkms-mkrpm.spec
+++ b/aws-neuronx-dkms-mkrpm.spec
@@ -7,6 +7,10 @@
 %{?!_srcdir: %define _srcdir %_prefix/src}
 %{?!_datarootdir: %define _datarootdir %{_datadir}}
 
+%define _source_filedigest_algorithm 10
+%define _binary_filedigest_algorithm 10
+%define _binary_payload w7.xzdio
+
 Summary:	%{module_name} %{version} dkms package
 Name:		aws-neuronx-dkms
 Version:	%{version}
@@ -84,6 +88,9 @@ exit 0
 %files
 %defattr(-,root,root)
 %{_srcdir}
+%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/preinstall
+%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/postinstall
+%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/postremove
 %{_datarootdir}/%{module_name}/
 
 %changelog
diff --git a/aws-neuronx-dkms-mkrpm/post_install.sh b/aws-neuronx-dkms-mkrpm/post_install.sh
new file mode 100644
index 0000000..af4f8f9
--- /dev/null
+++ b/aws-neuronx-dkms-mkrpm/post_install.sh
@@ -0,0 +1,12 @@
+for POSTINST in /usr/lib/dkms/common.postinst /usr/share/MODULE_NAME/postinst; do  # helper candidates, tried in this order
+    if [ -f $POSTINST ]; then
+        $POSTINST MODULE_NAME MODULE_VERSION /usr/share/MODULE_NAME  # the upper-case name/version tokens look like placeholders filled in at package build time -- confirm
+        exit $?  # the first helper found decides this script's exit status
+    fi
+    echo "WARNING: $POSTINST does not exist."
+done
+echo -e "ERROR: DKMS version is too old and MODULE_NAME was not"  # reached only when neither helper file exists
+echo -e "built with legacy DKMS support."
+echo -e "You must either rebuild MODULE_NAME with legacy postinst"
+echo -e "support or upgrade DKMS to a more current version."
+exit 1
diff --git a/aws-neuronx-dkms-mkrpm/pre_uninstall.sh b/aws-neuronx-dkms-mkrpm/pre_uninstall.sh
new file mode 100644
index 0000000..8e171a9
--- /dev/null
+++ b/aws-neuronx-dkms-mkrpm/pre_uninstall.sh
@@ -0,0 +1,12 @@
+echo -e
+echo -e "Uninstall of MODULE_NAME module (version MODULE_VERSION) beginning:"
+if lsmod | grep -q "^neuron "; then  # anchored name plus trailing space: only a module named exactly neuron matches
+    echo "Neuron module is currently loaded. Attempting to unload..."
+    if ! rmmod neuron 2>/dev/null; then  # rmmod's own error text is discarded; the messages below replace it
+        echo "ERROR: Cannot unload neuron module - it is currently in use."
+        echo "Please stop all processes using the neuron module before uninstalling."
+        exit 1  # stop here so the dkms remove below is not run while the module is still loaded
+    fi
+fi
+dkms remove -m MODULE_NAME -v MODULE_VERSION --all --rpm_safe_upgrade
+exit 0  # success is reported unconditionally from here, whatever dkms remove returned
diff --git a/dkms.conf b/dkms.conf
index ea60afc..f5b504e 100644
--- a/dkms.conf
+++ b/dkms.conf
@@ -1,5 +1,5 @@
 PACKAGE_NAME=aws-neuronx
-PACKAGE_VERSION=2.27.4.0
+PACKAGE_VERSION=2.28.0.0
 BUILT_MODULE_NAME[0]="neuron"
 MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build"
 CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean"
diff --git a/neuron_arch.h b/neuron_arch.h
index c27e5b9..831679d 100644
--- a/neuron_arch.h
+++ b/neuron_arch.h
@@ -24,6 +24,11 @@ enum neuron_platform_type {
 	NEURON_PLATFORM_TYPE_INVALID,
 };
 
+enum neuron_platform_operation_type { // selects which operation the narch_platform_ready() hook is asked about
+	NEURON_PLATFORM_OP_TYPE_DEVOPEN = 0, // passed by ncdev_open() while it waits on the device
+	NEURON_PLATFORM_OP_TYPE_EXEC = 1, // presumably an execution-time check; no caller visible in this patch section -- confirm
+};
+
 /**
  * narch_init() - Set neuron devices architecture and revision.
  *
diff --git a/neuron_cdev.c b/neuron_cdev.c
index 4dfbfe7..dbc12af 100644
--- a/neuron_cdev.c
+++ b/neuron_cdev.c
@@ -149,7 +149,7 @@ static int ncdev_dma_queue_init(struct neuron_device *nd, void *param)
 	else
 		rxc_mc = NULL;
 	ret = ndmar_queue_init(nd, arg.eng_id, arg.qid, arg.tx_desc_count, arg.rx_desc_count, tx_mc,
-			       rx_mc, rxc_mc, arg.axi_port, false);
+			       rx_mc, rxc_mc, false);
 	return ret;
 }
 
@@ -173,7 +173,7 @@ static int ncdev_dma_queue_init_batch_entry(struct neuron_device *nd, struct neu
 	else
 		rxc_mc = NULL;
 	ret = ndmar_queue_init(nd, arg->eng_id, arg->qid, arg->tx_desc_count, arg->rx_desc_count, tx_mc,
-						   rx_mc, rxc_mc, arg->axi_port, false);
+						   rx_mc, rxc_mc, false);
 	return ret;
 }
 
@@ -458,6 +458,7 @@ static int ncdev_mem_alloc_libnrt(struct neuron_device *nd, unsigned int cmd, vo
 	static_assert(NEURON_IOCTL_MEM_ALLOC_V2 != NEURON_IOCTL_MEM_ALLOC_V2MT);
 	static_assert(NEURON_IOCTL_MEM_ALLOC_V2 != NEURON_IOCTL_MEM_ALLOC_V2MT64);
 	static_assert(NEURON_IOCTL_MEM_ALLOC_V2MT != NEURON_IOCTL_MEM_ALLOC_V2MT64);
+	static_assert(NEURON_IOCTL_MEM_ALLOC_V2MT64 != NEURON_IOCTL_MEM_ALLOC_V2MT64_PA);
 
 	enum mem_location location;
 	u64 mh;
@@ -522,6 +523,53 @@ static int ncdev_mem_alloc_libnrt(struct neuron_device *nd, unsigned int cmd, vo
 		nc_id = mem_alloc_arg.nc_id;
 		mem_type = mem_alloc_arg.mem_type;
 		mem_handle = mem_alloc_arg.mem_handle;
+	} else if (cmd == NEURON_IOCTL_MEM_ALLOC_V2MT64_PA) {
+		/* Extended alloc that also returns PA in a single ioctl; this branch returns on its own */
+		struct neuron_ioctl_mem_alloc_v2_mem_type64_pa mem_alloc_arg;
+		ret = neuron_copy_from_user(__func__, &mem_alloc_arg, (struct neuron_ioctl_mem_alloc_v2_mem_type64_pa *)param,
+			     sizeof(mem_alloc_arg));
+		if (ret)
+			return ret;
+
+		size = mem_alloc_arg.size;
+		align = mem_alloc_arg.align;
+		host_memory = mem_alloc_arg.host_memory;
+		dram_channel = mem_alloc_arg.dram_channel;
+		dram_region = mem_alloc_arg.dram_region;
+		nc_id = mem_alloc_arg.nc_id;
+		mem_type = mem_alloc_arg.mem_type;
+		mem_handle = mem_alloc_arg.mem_handle;
+
+		if (host_memory)
+			location = MEM_LOC_HOST;
+		else
+			location = MEM_LOC_DEVICE;
+		ret = mc_alloc_align(nd, MC_LIFESPAN_CUR_PROCESS, size, align, location, dram_channel, dram_region, nc_id, mem_type, &mc);
+		if (ret)
+			return ret;
+
+		trace_ioctl_mem_alloc(nd, mc);
+
+		ret = ncdev_mem_chunk_to_mem_handle(nd, mc, &mh);
+		/* copy_to_user() returns the number of bytes NOT copied, never an errno: map it to -EFAULT */
+		if (!ret && copy_to_user(mem_handle, &mh, sizeof(mh)))
+			ret = -EFAULT;
+		if (ret) {
+			mc_free(&mc);
+			return ret;
+		}
+
+		/* Fill in PA and copy the struct back to userspace */
+		if (mc->mem_location == MEM_LOC_HOST)
+			mem_alloc_arg.pa = mc->pa | ndhal->ndhal_address_map.pci_host_base; // host chunks are reported with pci_host_base OR-ed in
+		else
+			mem_alloc_arg.pa = mc->pa;
+
+		if (copy_to_user(param, &mem_alloc_arg, sizeof(mem_alloc_arg))) {
+			mc_free(&mc); // NOTE(review): the handle was already written to mem_handle in userspace above
+			return -EFAULT;
+		}
+		return 0;
 	} else {
 		return -EINVAL;
 	}
@@ -1204,7 +1252,12 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd,
 	// simulation does not have bar4 mapped to the actual memory, don't do it
 	if (use_bar4_wr) {
 		u64 cpy_offset;
-		ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset);
+		ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset);
+		if (unlikely(ret)) {
+			pr_err("Failed to map address 0x%llx to BAR4\n", mc->pa + offset);
+			return ret;
+		}
+
 		// copy from user is slow, try fast copy and fall back if fails
 		pagefault_disable();
 		ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, buffer, size);
@@ -1378,7 +1431,11 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param)
 				const nrt_tensor_batch_op_t op = batch->ops_ptr[j];
 
 				u64 cpy_offset = 0;
-				ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset);
+				ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset);
+				if (unlikely(ret)) {
+					pr_err("Failed to map address 0x%llx to BAR4\n", mc->pa + op.offset);
+					goto cleanup;
+				}
 				// copy from user is slow, try fast copy and fall back if fails
 				pagefault_disable();
 				ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, op.buffer, op.size);
@@ -1732,7 +1789,6 @@ static long ncdev_nc_reset(struct neuron_device *nd, void *param)
 	if (ret)
 		return ret;
 
-	ndmar_close_ncs(nd, arg.nc_map);
 	arg.request_id = task_tgid_nr(current);
 	ret = nr_start_ncs(nd, arg.nc_map, arg.request_id);
 	if (ret) {
@@ -1743,7 +1799,6 @@ static long ncdev_nc_reset(struct neuron_device *nd, void *param)
 
 static long ncdev_device_reset_deprecated(struct neuron_device *nd)
 {
-	ndmar_close(nd);
 	nr_start(nd);
 	return 0;
 }
@@ -1913,7 +1968,8 @@ static long ncdev_driver_info(unsigned int cmd, void *param)
 										 NEURON_DRIVER_FEATURE_BATCH_DMAQ_INIT | NEURON_DRIVER_FEATURE_BIG_CORE_MAPS |
 										 NEURON_DRIVER_FEATURE_MEM_ALLOC_TYPE | NEURON_DRIVER_FEATURE_HBM_SCRUB |
 										 NEURON_DRIVER_FEATURE_MEM_ALLOC64 | NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD |
-										 NEURON_DRIVER_FEATURE_ZEROCOPY;
+										 NEURON_DRIVER_FEATURE_ZEROCOPY | NEURON_DRIVER_FEATURE_PINNED_HOST_MEM |
+										 NEURON_DRIVER_FEATURE_ALLOC_WITH_PA;
 
 			return copy_to_user(param, &driver_info, sizeof(driver_info));
 		}
@@ -2577,7 +2633,7 @@ static long ncdev_hbm_scrub_start(struct neuron_device *nd, void *param) {
 		uint32_t eng_id = dma_engines[i];
 		uint32_t qid = 0;
 		ret = ndmar_queue_init(nd, eng_id, qid, allocated_descs, allocated_descs, tx_mc[dma_engines[i]],
-					rx_mc[dma_engines[i]], NULL, arg.axi_port, true);
+					rx_mc[dma_engines[i]], NULL, true);
 		if (ret) {
 			pr_err("Failed to initialize DMA queue for engine %d for scrubbing nd%d HBM %d:\n", eng_id, nd->device_index, arg.hbm_index);
 			goto scrub_init_fail;
@@ -3108,13 +3164,14 @@ static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void *
 		return ret;
 	}
 
-	/* TODO: start h2d kernel thread */
-
 	if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) {
 		pr_err("nd%02d: invalid nc %u provided\n", nd->device_index, arg.nc_id);
 		return -EINVAL;
 	}
 
+	/* Set up the completion queue (CQ). The completion thread is created on-demand
+	 * when the first async zero-copy request is submitted.
+	 */
 	memset(arg.compl_queue_info, 0, sizeof(arg.compl_queue_info));
 
 	eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, arg.nc_id);
@@ -3144,6 +3201,40 @@ static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void *
 	return ret;
 }
 
+static int ncdev_host_mem_pin(void *param)
+{
+	/* Handler for NEURON_IOCTL_HOST_MEM_PIN; param is the userspace pointer to the ioctl argument. */
+	struct neuron_ioctl_host_mem_pin arg;
+	int ret;
+
+	ret = neuron_copy_from_user(__func__, &arg, param, sizeof(arg));
+	if (ret)
+		return ret;
+
+	arg.pa = ~0ULL; /* default: not contiguous */
+	ret = ndma_pin_host_memory(arg.va, arg.size, &arg.pa);
+	if (ret)
+		return ret;
+
+	/* Copy result (including pa) back to userspace */
+	if (copy_to_user(param, &arg, sizeof(arg)))
+		return -EFAULT; /* NOTE(review): no unpin is issued on this path -- confirm that is intended */
+	return 0;
+}
+
+static int ncdev_host_mem_unpin(void *param)
+{
+	/* Handler for NEURON_IOCTL_HOST_MEM_UNPIN; only arg.va is consumed and nothing is copied back. */
+	struct neuron_ioctl_host_mem_unpin arg;
+	int ret;
+
+	ret = neuron_copy_from_user(__func__, &arg, param, sizeof(arg));
+	if (ret)
+		return ret;
+	/* VA-based unregistration - requires exact VA match */
+	return ndma_unpin_host_memory(arg.va);
+}
+
 inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) {
 	if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) {
 		return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param);
@@ -3178,6 +3269,10 @@ inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsign
 		return ncdev_pod_ctrl(filep, cmd, (void *)param);
 	} else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_GET_VA_PLACEMENT)) {
 		return ncdev_get_va_placement((void *)param);
+	} else if (cmd == NEURON_IOCTL_HOST_MEM_PIN) {
+		return ncdev_host_mem_pin((void*)param);
+	} else if (cmd == NEURON_IOCTL_HOST_MEM_UNPIN) {
+		return ncdev_host_mem_unpin((void*)param);
 	}
 
 	pr_err("invalid misc IOCTL %d (dir=%d, type=%d, nr=%d, size=%d)\n", cmd, _IOC_DIR(cmd),
@@ -3408,7 +3503,7 @@ static int ncdev_open(struct inode *inode, struct file *filep)
 
 	// wait for device init to complete.
 	// TODO: implement some better wait system than schedule()
-	while (nd->device_state == NEURON_DEVICE_STATE_RESET) {
+	while ((nd->device_state == NEURON_DEVICE_STATE_RESET) || ndhal->ndhal_arch.narch_platform_ready(nd, NEURON_PLATFORM_OP_TYPE_DEVOPEN)) {
 		schedule();
 		if (sigismember(&current->pending.signal, SIGTERM) || sigismember(&current->pending.signal, SIGKILL)) {
 			mutex_lock(&dev->ncdev_lock);
@@ -3449,6 +3544,24 @@ static inline int ncdev_misc_flush(struct file *filep)
 	return 0;
 }
 
+/* Per-process cleanup for when the calling process closes its last open handle
+ * to any Neuron device. ncdev_flush() calls this after releasing ncdev_lock.
+ */
+static void ncdev_handle_process_exit_if_last(void)
+{
+	// does the process still have open handles on any device?
+	int j;
+	for (j = 0; j < MAX_NEURON_DEVICE_COUNT; j++) {
+		struct neuron_device *nd = neuron_pci_get_device(j);
+		if (nd && npid_is_attached(nd)) {
+			// not the last
+			return;
+		}
+	}
+	ndma_pinned_mem_cleanup_process(task_tgid_nr(current)); // no device reported an attachment: hand this tgid to the pinned-memory cleanup
+	// add more cleanup here if necessary
+}
+
 static int ncdev_flush(struct file *filep, fl_owner_t id)
 {
 	struct ncdev *dev;
@@ -3470,7 +3583,6 @@ static int ncdev_flush(struct file *filep, fl_owner_t id)
 	if (attach_cnt == 1) {
 		// If this proc exited in the middle of a reset, wait for the reset to be processed.
 		nr_wait(nd, task_tgid_nr(current), true);
-
 		ndmar_handle_process_exit(nd, task_tgid_nr(current));
 		msleep(10); // TODO - confirm with HW dev, whether any delay needed after q reset.
 		ncrwl_release_current_process(nd);
@@ -3492,6 +3604,9 @@ static int ncdev_flush(struct file *filep, fl_owner_t id)
 	npid_detach(nd);
 
 	mutex_unlock(&dev->ncdev_lock);
+	if (attach_cnt == 1) {
+		ncdev_handle_process_exit_if_last();
+	}
 
 	return 0;
 }
@@ -3537,6 +3652,13 @@ static int ncdev_mmap(struct file *filep, struct vm_area_struct *vma)
 	return nmmap_mem(nd, vma);
 }
 
+static unsigned long ncdev_get_unmapped_area(struct file *filep, unsigned long addr,
+					     unsigned long len, unsigned long pgoff,
+					     unsigned long flags)
+{
+	return nmmap_get_unmapped_area(filep, addr, len, pgoff, flags); // .get_unmapped_area hook of ncdev_fops; forwards every argument unchanged
+}
+
 static struct file_operations ncdev_fops = {
 	.owner = THIS_MODULE,
 	.open = ncdev_open,
@@ -3544,18 +3666,26 @@ static struct file_operations ncdev_fops = {
 	.release = ncdev_release,
 	.unlocked_ioctl = ncdev_ioctl,
 	.mmap = ncdev_mmap,
+	.get_unmapped_area = ncdev_get_unmapped_area,
 };
 
 static ssize_t device_reset_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	int minor = MINOR(dev->devt);
-	return sprintf(buf, "%d\n", devnodes[minor].ndev->device_state);
+	if (minor >= NEURON_MAX_DEV_NODES) { // refuse minors past the bound before indexing devnodes[]
+		return -ENODEV;
+	}
+	return scnprintf(buf, PAGE_SIZE, "%d\n", devnodes[minor].ndev->device_state); // prints the numeric device_state value
 }
 
 static ssize_t driver_reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
 {
+	struct ncdev *devnode;
 	int minor = MINOR(dev->devt);
-	struct ncdev *devnode = &devnodes[minor];
+	if (minor >= NEURON_MAX_DEV_NODES) {
+		return -ENODEV;
+	}
+	devnode = &devnodes[minor];
 
 	mutex_lock(&devnode->ncdev_lock);
 	if (devnode->open_count == 0) { // only trigger sysfs reset if the device is not opened by app
@@ -3570,24 +3700,33 @@ static DEVICE_ATTR(reset, S_IWUSR | S_IRUSR, device_reset_show, driver_reset_sto
 
 static ssize_t neuron_core_count_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	int ret = 0;
-	ret = sprintf(buf, "%d", ndhal->ndhal_address_map.nc_per_device);
-	return ret;
+	// We would prefer to end the value with a newline for consistency with the other
+	// attributes, but that breaks backward compatibility with the device plugin. The
+	// plugin is being fixed to tolerate a newline, so one can be added here once
+	// older plugins are no longer a concern - https://tiny.amazon.com/k1ezuoub
+	return scnprintf(buf, PAGE_SIZE, "%d", ndhal->ndhal_address_map.nc_per_device);
 }
 
-static DEVICE_ATTR(core_count, S_IRUSR, neuron_core_count_show, NULL);
+static DEVICE_ATTR(core_count, S_IRUGO, neuron_core_count_show, NULL);
 
 #define CONNECTED_DEVICES_MAX_LEN 20
 static ssize_t neuron_connected_devices_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
+	int ret = 0;
 	int i = 0;
 	int offset = 0;
-	int minor = MINOR(dev->devt); // neuron device id
-	struct ncdev *devnode = &devnodes[minor];
-	struct neuron_device *nd = devnode->ndev;
+	struct ncdev *devnode;
+	struct neuron_device *nd;
 	u32 connected_devices[MAX_NEURON_DEVICE_COUNT];
 	int connected_device_count = 0;
-	int ret = ndhal->ndhal_fw_io.fw_io_topology(nd->fw_io_ctx, nd->pdev->device, minor, connected_devices, &connected_device_count);
+
+	int minor = MINOR(dev->devt); // neuron device id
+	if (minor >= NEURON_MAX_DEV_NODES) {
+		return -ENODEV;
+	}
+	devnode = &devnodes[minor];
+	nd = devnode->ndev;
+	ret = ndhal->ndhal_fw_io.fw_io_topology(nd->fw_io_ctx, nd->pdev->device, minor, connected_devices, &connected_device_count);
 	if (ret)
 		return ret;
 
@@ -3610,27 +3749,53 @@ static ssize_t neuron_connected_devices_show(struct device *dev, struct device_a
 	return offset;
 }
 
-static DEVICE_ATTR(connected_devices, S_IRUSR, neuron_connected_devices_show, NULL);
+static DEVICE_ATTR(connected_devices, S_IRUGO, neuron_connected_devices_show, NULL);
 
 static ssize_t fw_api_version_show(struct device *dev, struct device_attribute *attr, char *buf)
-{	int fw_api_version;
+{
+	struct neuron_device *nd;
+	int fw_api_version;
 	int minor = MINOR(dev->devt);
-	struct neuron_device *nd = devnodes[minor].ndev;
+	if (minor >= NEURON_MAX_DEV_NODES) { // refuse minors past the bound before indexing devnodes[]
+		return -ENODEV;
+	}
+	nd = devnodes[minor].ndev;
 
 	fw_io_api_version_read(nd->npdev.bar0, &fw_api_version);
 	if (fw_api_version == 0xdeadbeef) { // the value is not readable during reset, try later
-		return sprintf(buf, "busy\n");
+		return scnprintf(buf, PAGE_SIZE, "busy\n");
 	}
-	return sprintf(buf, "%u\n", fw_api_version);
+	return scnprintf(buf, PAGE_SIZE, "%u\n", fw_api_version); // NOTE(review): %u is applied to an int here
 }
 
 static DEVICE_ATTR(fw_api_version, S_IRUGO, fw_api_version_show, NULL);
 
+static ssize_t fw_build_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* sysfs "fw_build" read: prints the value from fw_io_fw_build_read(), or "busy" when it cannot be read. */
+	int ret;
+	struct neuron_device *nd;
+	u32 fw_build;
+	int minor = MINOR(dev->devt);
+	if (minor >= NEURON_MAX_DEV_NODES) {
+		return -ENODEV;
+	}
+	nd = devnodes[minor].ndev;
+	ret = fw_io_fw_build_read(nd->npdev.bar0, &fw_build);
+	if (ret || fw_build == 0xdeadbeef) { // the value is not readable during reset, try later
+		return scnprintf(buf, PAGE_SIZE, "busy\n");
+	}
+	return scnprintf(buf, PAGE_SIZE, "%u\n", fw_build);
+}
+
+static DEVICE_ATTR(fw_build, S_IRUGO, fw_build_show, NULL);
+
 static struct attribute *attrs[] = {
 	&dev_attr_reset.attr,
 	&dev_attr_core_count.attr,
 	&dev_attr_connected_devices.attr,
 	&dev_attr_fw_api_version.attr,
+	&dev_attr_fw_build.attr,
 	NULL,
 };
 
@@ -3837,6 +4002,22 @@ static ssize_t ncdev_class_ultraserver_mode_show(struct class *class, struct cla
 	return ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data(buf);
 }
 
+static bool platform_device_initialization_inprogress(void)
+{
+	return total_neuron_devices == 0; // zero devices counted so far; the sysfs callers answer "busy" while this holds
+}
+
+static bool platform_device_initialization_successful(void)
+{
+	int i;
+	for (i = 0; i < total_neuron_devices; i++) {
+		if (neuron_devices[i] == NULL) { // a counted slot holds no device
+			return false; // the sysfs callers answer "init_error" instead of dereferencing the slot
+		}
+	}
+	return true;
+}
+
 #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5)))
 static ssize_t ncdev_class_hbm_7200_show(const struct class *class, const struct class_attribute *attr, char *buf)
 #else
@@ -3845,10 +4026,15 @@ static ssize_t ncdev_class_hbm_7200_show(struct class *class, struct class_attri
 {
 	int i;
 	int supports_hbm_7200 = 1;
-	if (total_neuron_devices == 0) {
+
+	if (platform_device_initialization_inprogress()) {
 		return dhal_sysfs_emit(buf, "busy\n");
 	}
 
+	if (!platform_device_initialization_successful()) {
+		return dhal_sysfs_emit(buf, "init_error\n");
+	}
+
 	for (i = 0; i < total_neuron_devices; i++) {
 		if (neuron_devices[i]->supports_hbm_7200 == -1) {
 			return dhal_sysfs_emit(buf, "busy\n");
@@ -3867,10 +4053,15 @@ static ssize_t ncdev_class_cur_perf_profile_show(struct class *class, struct cla
 {
 	int i;
 	int cur_perf_profile;
-	if (total_neuron_devices == 0) {
+
+	if (platform_device_initialization_inprogress()) {
 		return dhal_sysfs_emit(buf, "busy\n");
 	}
 
+	if (!platform_device_initialization_successful()) {
+		return dhal_sysfs_emit(buf, "init_error\n");
+	}
+
 	cur_perf_profile = neuron_devices[0]->current_perf_profile;
 	for (i = 1; i < total_neuron_devices; i++) {
 		if (neuron_devices[i]->current_perf_profile != cur_perf_profile) {
diff --git a/neuron_crwl.c b/neuron_crwl.c
index 04a48e1..f528f1c 100644
--- a/neuron_crwl.c
+++ b/neuron_crwl.c
@@ -219,6 +219,7 @@ int ncrwl_nc_range_mark(u32 nc_count, u32 start_nc_index, u32 end_nc_index,
 				ncrwl_range_mark_cnt++;
 			}
 			mutex_unlock(&ncrwl_range_lock);
+			pr_info("pid:%d  claiming neuron cores: %02d-%02d\n", task_tgid_nr(current), i, i + nc_count - 1); // newline terminates the log record
 			return 0;
 		}
 		if (*max_range < range_len)
@@ -232,9 +233,13 @@ int ncrwl_nc_range_mark(u32 nc_count, u32 start_nc_index, u32 end_nc_index,
 void ncrwl_nc_range_unmark(volatile long unsigned int *free_map)
 {
 	int i;
+	int first = -1; // lowest core index released by this call, -1 when nothing was released
+	int last = -1;  // highest core index released by this call; only reported when first != -1
 	mutex_lock(&ncrwl_range_lock);
 	for (i = 0; i < MAX_NEURON_DEVICE_COUNT * MAX_NC_PER_DEVICE; i++) {
 		if (test_bit(i, free_map) && ncrwl_range_pids[i] == task_tgid_nr(current)) {
+			first = (first == -1) ? i : first;
+			last = i;
 			ncrwl_range_pids[i] = 0;
 			ncrwl_range_mark_cnt--;
 		}
@@ -242,6 +247,9 @@ void ncrwl_nc_range_unmark(volatile long unsigned int *free_map)
 		ndhal->ndhal_npe.npe_notify_mark(ncrwl_range_mark_cnt, false);
 	}
 	mutex_unlock(&ncrwl_range_lock);
+	if (first != -1) {
+		pr_info("pid:%d  releasing neuron core in range: %02d-%02d\n", task_tgid_nr(current), first, last);
+	}
 }
 
 int ncrwl_nc_range_pid_get( uint32_t nc_index, pid_t *pid)
diff --git a/neuron_device.h b/neuron_device.h
index 8af4f29..fa67c5b 100644
--- a/neuron_device.h
+++ b/neuron_device.h
@@ -71,6 +71,7 @@ struct neuron_device {
 	struct pci_dev *pdev;
 	int device_index;
 	volatile enum neuron_device_state device_state; // current state of this device
+	struct mutex lock; // serialize neuron_device access when device-wide exclusion is needed
 
 	// all the processes that are opened this device
 	struct neuron_attached_process attached_processes[NEURON_MAX_PROCESS_PER_DEVICE];
@@ -126,6 +127,9 @@ struct neuron_device {
 	// volatile to prevent compiler optimizations since accessed by different threads
 	// This is the true value per-device, instead of the global one in ndhal_perf used only for metrics
 	volatile int current_perf_profile;
+
+	// DMA completion thread for async IO
+	struct ndma_h2d_dma_cmpltn_thread dma_cmpltn_thread;
 };
 
 #endif
diff --git a/neuron_dhal.h b/neuron_dhal.h
index dbce141..be38ead 100644
--- a/neuron_dhal.h
+++ b/neuron_dhal.h
@@ -26,6 +26,7 @@ struct ndhal_arch {
     int arch;
     enum neuron_platform_type platform_type;
     u32 server_id;
+    int (*narch_platform_ready) (struct neuron_device *nd, enum neuron_platform_operation_type platform_operation);
 };
 struct ndhal_address_map {
 	// addresses
@@ -36,7 +37,6 @@ struct ndhal_address_map {
 	uint64_t mmap_nc_sema_incr_offset;
 	uint64_t mmap_nc_sema_decr_offset;
 	uint64_t bar0_misc_ram_offset;
-	uint64_t port_1_base;
 
 	// counts
 	int nc_per_device;
@@ -90,6 +90,8 @@ struct ndhal_mpset {
 struct ndhal_ndmar {
     uint32_t (*ndmar_get_h2t_eng_id) (struct neuron_device *nd, uint32_t nc_id);
     int (*ndmar_get_h2t_def_qid) (uint32_t nc_id);
+    int (*ndmar_ctx_queue_bit) (uint32_t h2d_eng_id, uint32_t qid);
+    void (*ndmar_ctx_queue_from_bit) (int bit, uint32_t *h2d_eng_id, uint32_t *qid);
     bool (*ndmar_is_h2t_def_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id);
     bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); 
     bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id);
@@ -97,6 +99,7 @@ struct ndhal_ndmar {
 };
 
 struct ndhal_fw_io {
+    u32 new_readless_read_min_api_version;
     int (*fw_io_topology) (struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count);
     int (*fw_io_register_readless_read_region) (struct fw_io_ctx *ctx, void __iomem *bar0, u64 bar0_size, void __iomem *bar2, u64 bar2_size);
     int (*fw_io_read_csr_array) (void **addrs, u32 *values, u32 num_csrs, bool operational);
@@ -131,6 +134,8 @@ struct ndhal_sysfs_metrics {
                                                 int nc_id,
                                                 int tensor_engine_attrs_info_tbl_cnt,
                                                 const nsysfsmetric_attr_info_t *tensor_engine_attr_info_tbl);
+
+    bool health_status_enabled;
 };
 
 struct ndhal_pci {
diff --git a/neuron_dma.c b/neuron_dma.c
index 32e7d43..901048b 100644
--- a/neuron_dma.c
+++ b/neuron_dma.c
@@ -11,6 +11,9 @@
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
 #include <linux/bitops.h>
+#include <linux/hashtable.h>
+#include <linux/kref.h>
+
 
 #include "udma/udma.h"
 #include "neuron_trace.h"
@@ -33,6 +36,37 @@ MODULE_PARM_DESC(zerocopy_trn1_override, "override zerocopy for trn1");
 
 struct neuron_device;
 
+/* data structures for explicit pin/unpin API */
+
+/**
+ * struct neuron_pinned_mem - Tracks a pre-pinned host memory region
+ * @va: User virtual address that was pinned (lookup key)
+ * @size: Size of the pinned region in bytes
+ * @nr_pages: Number of pages pinned
+ * @pages: Array of pinned page pointers
+ * @rb_node: Red-black tree node for efficient VA lookup
+ *
+ * Process isolation is structural: each process has its own rbtree
+ * (struct neuron_pinned_mem_process.root), so no pid field is needed here.
+ */
+struct neuron_pinned_mem {
+	u64 va;                    /* lookup key - user virtual address */
+	u64 size;                  /* size of the pinned region in bytes */
+	unsigned long nr_pages;    /* number of pages pinned */
+	struct page **pages;       /* array of pinned page pointers */
+	struct rb_node rb_node;    /* for VA-based lookup */
+};
+
+/* Per-process pinned memory state: one rbtree of pinned regions plus its lock and refcount */
+struct neuron_pinned_mem_process {
+	pid_t pid;                    /* process this state belongs to (presumably the tgid -- confirm) */
+	struct rb_root root;          /* rbtree of pinned regions for this process */
+	struct mutex lock;            /* protects this process's rbtree */
+	struct kref refcount;         /* lifetime management; freed when last ref drops */
+	struct hlist_node hash_node;  /* for hash table lookup */
+};
+
+
 static void ndma_ack_completed_desc(struct ndma_eng *eng, struct ndma_ring *ring, u32 count)
 {
 	struct udma_q *rxq, *txq;
@@ -935,6 +969,10 @@ struct ndma_h2t_zcdma_context {
 	struct page         **page_list;          // page structures tracking our pinned pages;
 											  // managed by page_list_pool in ctx queue
 	enum ndma_zcdma_state state;              // state of this transfer
+	struct neuron_pinned_mem_process *prepin_proc;	// ref counted ptr to per process store of pinned memories
+										      //  when set indicates that 1/ the context uses pre-pinned mem and it should not be unpinned
+											  //  2/ prevents process exit cleanup from unpinning the memory while used by the context
+	pid_t				pid;				  // PID of the process that initiated the copy
 
 	// Completion-related
 	void                 *completion_ptr;     // completion buffer pointer;
@@ -946,20 +984,29 @@ struct ndma_h2t_zcdma_context {
 	struct mm_struct     *mm;                 // mm that owns the user buffers
 };
 
+static void ndma_pinned_mem_process_release(struct kref *kref);
+
 static void ndma_zc_release_ctx(struct ndma_h2t_zcdma_context *ctx, u64 *nr_pinned_pages)
 {
 	// do not free or set completion_ptr null. it is managed by completion_pool in ctx queue
 	// do not free or set page_list null. it is managed by page_list_pool in ctx queue
 
 	if (ctx->state >= NDMA_PINNED_UNSUBMITTED) {
-		if (ctx->direction) {
-			unpin_user_pages(ctx->page_list, ctx->nr_pages);
+		/* Only unpin if we pinned it ourselves (not pre-pinned memory) */
+		if (!ctx->prepin_proc) {
+			if (ctx->direction) {
+				unpin_user_pages(ctx->page_list, ctx->nr_pages);
+			} else {
+				unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true);
+			}
 		} else {
-			unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true);
+			kref_put(&ctx->prepin_proc->refcount, ndma_pinned_mem_process_release);
 		}
+
 		*nr_pinned_pages -= ctx->nr_pages;
 	}
 	ctx->nr_pages = 0;
+	ctx->prepin_proc = NULL;
 
 	if (ctx->mm) {
 		mmput(ctx->mm);
@@ -1300,7 +1347,7 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue)
 	}
 
 	// allocate page_list arrays in one contiguous pool, and let each entry point to its slice
-	queue->page_list_pool = kcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL);
+	queue->page_list_pool = kvcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL);
 	if (!queue->page_list_pool) {
 		pr_err("failed to allocate ctx queue page_list pool\n");
 		goto err;
@@ -1321,7 +1368,7 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue)
 		queue->completion_pool = NULL;
 	}
 	if (queue->page_list_pool) {
-		kfree(queue->page_list_pool);
+		kvfree(queue->page_list_pool);
 		queue->page_list_pool = NULL;
 	}
 	if (queue->entries) {
@@ -1333,6 +1380,10 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue)
 
 void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue)
 {
+	int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng->eng_id, ring->qid);
+
+	atomic64_andnot(BIT_ULL(bit), &eng->nd->dma_cmpltn_thread.nonempty_ctxq_bitmap);
+
 	if (!queue) {
 		return;
 	}
@@ -1346,7 +1397,7 @@ void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct nd
 		queue->completion_pool = NULL;
 	}
 	if (queue->page_list_pool) {
-		kfree(queue->page_list_pool);
+		kvfree(queue->page_list_pool);
 		queue->page_list_pool = NULL;
 	}
 	memset(queue, 0, sizeof(*queue));
@@ -1533,6 +1584,8 @@ static bool ndma_zc_should_wait(struct ndma_eng *eng,
 	return false;
 }
 
+static bool ndma_pinned_mem_try_populate(pid_t pid, u64 va, u64 size, struct page **page_list, int nr_pages, struct neuron_pinned_mem_process **prepin_proc);
+
 static int ndma_zerocopy_pin_pages(int nd_id,
 								   u32 nc_id,
 								   struct ndma_ctx_queue *ctx_queue,
@@ -1540,6 +1593,16 @@ static int ndma_zerocopy_pin_pages(int nd_id,
 								   bool use_remote_pin)
 {
 	int nr_pinned = 0;
+	struct neuron_pinned_mem_process *prepin_proc = NULL;
+
+	/* Check if this VA range is in pre-pinned memory */
+	if (ndma_pinned_mem_try_populate(dma_ctx->pid, (u64)dma_ctx->host_addr, dma_ctx->size,
+					 dma_ctx->page_list, dma_ctx->nr_pages, &prepin_proc)) {
+		dma_ctx->prepin_proc = prepin_proc;
+		ctx_queue->nr_pinned_pages += dma_ctx->nr_pages;
+		dma_ctx->state = NDMA_PINNED_UNSUBMITTED;
+		return 0;
+	}
 
 	if (use_remote_pin) {
 		if (!dma_ctx->mm) {
@@ -1627,6 +1690,13 @@ int ndma_zerocopy_submit(struct neuron_device *nd,
 		return -ENOENT;
 	}
 
+	if (async) {
+		ret = ndma_h2d_create_cmpltn_thread(nd);
+		if (ret) {
+			return ret;
+		}
+	}
+
 	mutex_lock(&ring->h2t_ring_lock);
 
 	for (i = 0; i < num_ops; i++) {
@@ -1685,6 +1755,7 @@ int ndma_zerocopy_submit(struct neuron_device *nd,
 				cur_ctx->state          = NDMA_UNPINNED;
 				cur_ctx->nr_desc        = 0; // Set by ndma_build_n_issue_zc_descs().
 				cur_ctx->mm             = NULL;
+				cur_ctx->pid		    = task_tgid_nr(current);
 				cur_ctx->sequence_num   = sequence_num;
 
 				/* Pin now if possible; otherwise capture mm for remote pinning (async only). */
@@ -1770,25 +1841,33 @@ int ndma_zerocopy_submit(struct neuron_device *nd,
 	if (ret) {
 		ndma_ctx_queue_drain(eng, ring, ctx_queue);
 	}
+
 	mutex_unlock(&ring->h2t_ring_lock);
+
+	if (!ret && async) {
+		int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng_id, qid);
+		atomic64_or(BIT_ULL(bit), &nd->dma_cmpltn_thread.nonempty_ctxq_bitmap); // set the bit for this queue
+		wake_up(&nd->dma_cmpltn_thread.wait_queue);
+	}
+
 	return ret;
 }
 
 /* The completion flow for completion, remote pinning, and submission. Async IO only */
-static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd,
-												 struct ndma_eng *eng,
-												 struct ndma_ring *ring,
-												 bool *did_work)
+static int ndma_zerocopy_complete(struct neuron_device *nd,
+								  struct ndma_eng *eng,
+								  struct ndma_ring *ring,
+								  u64 *nonempty_ctxq_bitmap_copy)
 {
 	int ret = 0;
 	int err = 0;
+	bool did_work = false;
 	struct ndma_ctx_queue *ctx_queue = NULL;
 	u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO;
 
-	if (!ring || !did_work) {
+	if (!ring) {
 		return -EINVAL;
 	}
-	*did_work = false;
 
 	ctx_queue = &ring->dma_ctx_queue;
 
@@ -1799,7 +1878,12 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd,
 		if (ndma_ctx_queue_submitted_empty(ctx_queue)) {
 			break;
 		}
-		if (*did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) {
+		/*
+		 * Async completion must always retire at least one submitted context.
+		 * Only fall back to the wait-throttling heuristic after we have made
+		 * some forward progress in this pass.
+		 */
+		if (did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) {
 			break;
 		}
 		struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue);
@@ -1815,8 +1899,7 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd,
 		}
 
 		ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages);
-
-		*did_work = true;
+		did_work = true;
 	}
 
 	/* 2) Submit pinned but unsubmitted contexts */
@@ -1836,8 +1919,7 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd,
 		} else {
 			ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue);
 		}
-
-		*did_work = true;
+		did_work = true;
 	}
 
 	/* 3) Remote pin unpinned contexts */
@@ -1857,10 +1939,478 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd,
 		} else {
 			ndma_ctx_queue_inc_first_unpinned(ctx_queue);
 		}
-
-		*did_work = true;
+		did_work = true;
 	}
 
 	mutex_unlock(&ring->h2t_ring_lock);
+
+	if (ndma_ctx_queue_is_empty(ctx_queue)) {
+		int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng->eng_id, ring->qid);
+		*nonempty_ctxq_bitmap_copy &= ~BIT_ULL(bit);
+	}
+
 	return err;
 }
+
+/*
+ * Completion kthread for one neuron_device (passed as arg): sleeps until a ctx queue
+ * bit is set or a stop is requested, then services every queue whose bit is set.
+ */
+static int ndma_h2d_cmpltn_thread_fn(void *arg)
+{
+	struct neuron_device *nd = (struct neuron_device *)arg;
+	int ret = 0;
+
+	while (!kthread_should_stop() && !nd->dma_cmpltn_thread.stop) {
+		wait_event_interruptible(nd->dma_cmpltn_thread.wait_queue,
+								 kthread_should_stop() || nd->dma_cmpltn_thread.stop || atomic64_read(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap) != 0);
+		if (kthread_should_stop() || nd->dma_cmpltn_thread.stop) {
+			break;
+		}
+		/* Claim every pending bit; async submitters set a queue's bit again when they add work. */
+		u64 bitmap = atomic64_xchg(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap, 0);
+		// ndma_zerocopy_complete() clears a bit only once that queue is empty, so re-check
+		// the stop request on every pass or kthread_stop() would wait for a full drain.
+		while (bitmap && !kthread_should_stop() && !nd->dma_cmpltn_thread.stop) {
+			u32 eng_id, qid;
+
+			ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit(__ffs64(bitmap), &eng_id, &qid);
+			struct ndma_eng *eng = &nd->ndma_engine[eng_id];
+			ret = ndma_zerocopy_complete(nd, eng, &eng->queues[qid].ring_info, &bitmap);
+			if (ret) {
+				pr_err("dma completion thread failed to process ctx queue for eng %d q %d: %d\n", eng_id, qid, ret);
+			}
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Start the per-device H2D DMA completion kthread if it is not already running.
+ * An unlocked READ_ONCE() check skips nd->lock in the common case; the check is
+ * repeated under nd->lock before the thread is created.
+ * Returns 0 when the thread exists or was started, else the kthread_run() error.
+ */
+int ndma_h2d_create_cmpltn_thread(struct neuron_device *nd)
+{
+	struct task_struct *worker;
+	int rc = 0;
+
+	if (READ_ONCE(nd->dma_cmpltn_thread.thread))
+		return 0;
+
+	mutex_lock(&nd->lock);
+	if (!nd->dma_cmpltn_thread.thread) {
+		/* Stop flag, wait queue and bitmap are set up before the thread is started. */
+		nd->dma_cmpltn_thread.stop = false;
+		init_waitqueue_head(&nd->dma_cmpltn_thread.wait_queue);
+		atomic64_set(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap, 0);
+		worker = kthread_run(ndma_h2d_cmpltn_thread_fn, nd, "neuron dma cmpltn");
+		if (IS_ERR(worker)) {
+			rc = PTR_ERR(worker);
+			pr_err("h2d dma completion thread creation failed\n");
+		} else {
+			WRITE_ONCE(nd->dma_cmpltn_thread.thread, worker);
+		}
+	}
+	mutex_unlock(&nd->lock);
+	return rc;
+}
+
+/* Ask the completion kthread to exit, wait for it via kthread_stop(), and clear the task pointer. */
+void ndma_h2d_stop_cmpltn_thread(struct neuron_device *nd)
+{
+	struct task_struct *worker = nd->dma_cmpltn_thread.thread;
+
+	if (!worker)
+		return;
+	if (!IS_ERR(worker)) {
+		/* Raise the flag before the wake-up so the thread sees it when it re-checks. */
+		nd->dma_cmpltn_thread.stop = true;
+		wake_up(&nd->dma_cmpltn_thread.wait_queue);
+		kthread_stop(worker);
+	}
+	nd->dma_cmpltn_thread.thread = NULL;
+}
+
+/*
+ * Pre-pinned host memory implementation
+ * Uses a global hash table keyed by PID, with each process having its own
+ * rbtree of pinned memory regions keyed by VA.
+ * Host memory is not device-specific — a process can pin via any device
+ * and the zerocopy path on any device will find the pre-pinned region.
+ */
+
+/* 256 buckets: up to 16 devices × 16 processes per device */
+static DEFINE_HASHTABLE(pinned_mem_htable, 8);
+static DEFINE_MUTEX(pinned_mem_htable_lock); /* protects hash table add/remove/lookup only */
+
+/*
+ * Find or create per-process state and take a reference.
+ * Caller must hold pinned_mem_htable_lock; caller owns the returned ref.
+ */
+static struct neuron_pinned_mem_process *ndma_pinned_mem_get_process_locked(pid_t pid)
+{
+	struct neuron_pinned_mem_process *cur, *fresh;
+
+	/* Reuse the record for this pid when one is already hashed. */
+	hash_for_each_possible(pinned_mem_htable, cur, hash_node, pid) {
+		if (cur->pid != pid)
+			continue;
+		kref_get(&cur->refcount);
+		return cur;
+	}
+	fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
+	if (fresh == NULL)
+		return NULL;
+	fresh->pid = pid;
+	fresh->root = RB_ROOT;
+	mutex_init(&fresh->lock);
+	kref_init(&fresh->refcount); /* this initial ref belongs to the hash table */
+	kref_get(&fresh->refcount);  /* second ref is handed to the caller */
+	hash_add(pinned_mem_htable, &fresh->hash_node, pid);
+	return fresh;
+}
+
+/*
+ * Find per-process state and take a reference.
+ * Caller must hold pinned_mem_htable_lock; caller owns the returned ref.
+ * Returns NULL if not found (no ref taken).
+ */
+static struct neuron_pinned_mem_process *ndma_pinned_mem_find_process_locked(pid_t pid)
+{
+	struct neuron_pinned_mem_process *cur;
+
+	hash_for_each_possible(pinned_mem_htable, cur, hash_node, pid) {
+		if (cur->pid != pid)
+			continue; /* a different pid that landed in the same bucket */
+		kref_get(&cur->refcount); /* this reference is owned by the caller */
+		return cur;
+	}
+	return NULL;
+}
+
+static void ndma_pinned_mem_free_entry(struct neuron_pinned_mem *entry)
+{
+	struct page **pinned = entry->pages;
+	if (pinned) /* drop the page pins before the array that tracks them */
+		unpin_user_pages(pinned, entry->nr_pages);
+	kvfree(pinned); /* no-op when there is no page array */
+	kfree(entry);
+}
+
+/* Detach and free every pinned region in root, always taking the leftmost node next. */
+static void ndma_pinned_mem_destroy_tree(struct rb_root *root)
+{
+	struct rb_node *cur;
+
+	for (cur = rb_first(root); cur; cur = rb_first(root)) {
+		rb_erase(cur, root);
+		ndma_pinned_mem_free_entry(rb_entry(cur, struct neuron_pinned_mem, rb_node));
+	}
+}
+
+void ndma_pinned_mem_destroy(void)
+{
+	struct neuron_pinned_mem_process *victim;
+	struct hlist_node *next;
+	int bucket;
+	mutex_lock(&pinned_mem_htable_lock);
+	/* Each record is unhashed and freed directly here, without going through its refcount. */
+	hash_for_each_safe(pinned_mem_htable, bucket, next, victim, hash_node) {
+		hash_del(&victim->hash_node);
+		ndma_pinned_mem_destroy_tree(&victim->root);
+		kfree(victim);
+	}
+	mutex_unlock(&pinned_mem_htable_lock);
+}
+
+static void ndma_pinned_mem_process_release(struct kref *kref)
+{
+	struct neuron_pinned_mem_process *dead;
+	dead = container_of(kref, struct neuron_pinned_mem_process, refcount);
+	/* kref release callback: the last reference is gone, so the lock is likely uncontended. */
+	mutex_lock(&dead->lock);
+	ndma_pinned_mem_destroy_tree(&dead->root);
+	mutex_unlock(&dead->lock);
+	kfree(dead);
+}
+
+/* Find by exact VA match (for unpin) - caller must hold lock */
+static struct neuron_pinned_mem *ndma_pinned_mem_find_exact_locked(struct rb_root *root, u64 va)
+{
+	struct rb_node *cur = root->rb_node;
+	struct neuron_pinned_mem *region;
+
+	// Binary-search descent keyed on each region's start VA;
+	// only a region that starts exactly at va is a match.
+	while (cur != NULL) {
+		region = rb_entry(cur, struct neuron_pinned_mem, rb_node);
+		if (va == region->va)
+			return region;
+		cur = (va < region->va) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/* Find region containing VA range (for zerocopy) - caller must hold lock */
+static struct neuron_pinned_mem *ndma_pinned_mem_find_containing_locked(struct rb_root *root, u64 va, u64 size)
+{
+	const u64 req_end = va + size; /* exclusive end of the requested range */
+	struct rb_node *cur = root->rb_node;
+
+	while (cur != NULL) {
+		struct neuron_pinned_mem *region = rb_entry(cur, struct neuron_pinned_mem, rb_node);
+		const u64 region_end = region->va + region->size;
+
+		if (req_end <= region->va) {
+			cur = cur->rb_left;  /* request lies wholly below this region */
+			continue;
+		}
+		if (va >= region_end) {
+			cur = cur->rb_right; /* request lies wholly above this region */
+			continue;
+		}
+		// The request touches this region: it is usable only when it fits entirely
+		// inside; a partial overlap is reported as "not found".
+		if (va >= region->va && req_end <= region_end)
+			return region;
+		return NULL;
+	}
+	return NULL;
+}
+
+/* Insert into rbtree - caller must hold lock */
+static int ndma_pinned_mem_insert_locked(struct rb_root *root, struct neuron_pinned_mem *new)
+{
+	const u64 new_start = new->va;
+	const u64 new_end = new_start + new->size;
+	struct rb_node **slot = &root->rb_node;
+	struct rb_node *above = NULL;
+
+	/* Walk down to the leaf slot for new_start, refusing overlap with any region met on the way. */
+	while (*slot != NULL) {
+		struct neuron_pinned_mem *cur = rb_entry(*slot, struct neuron_pinned_mem, rb_node);
+
+		above = *slot;
+		if (new_start == cur->va)
+			return -EEXIST; /* same start VA is already registered */
+
+		if (new_start < cur->va) {
+			if (new_end > cur->va)
+				return -EEXIST; /* tail of the new range runs into cur */
+			slot = &above->rb_left;
+		} else {
+			if (new_start < cur->va + cur->size)
+				return -EEXIST; /* new range starts inside cur */
+			slot = &above->rb_right;
+		}
+	}
+
+	rb_link_node(&new->rb_node, above, slot);
+	rb_insert_color(&new->rb_node, root);
+	return 0;
+}
+
+/**
+ * ndma_check_pages_contiguous() - Check if pinned pages are physically contiguous
+ * @pages: Array of pinned pages
+ * @nr_pages: Number of pages
+ * @offset: Byte offset within the first page
+ *
+ * Return: Physical address of the start of the region if all pages are
+ *         contiguous, or ~0ULL if they are not.
+ */
+static u64 ndma_check_pages_contiguous(struct page **pages, unsigned long nr_pages, unsigned long offset)
+{
+	const u64 base = page_to_phys(pages[0]);
+	unsigned long idx;
+	/* Contiguous means page idx sits exactly idx pages above the first page. */
+	for (idx = 1; idx < nr_pages; idx++)
+		if (page_to_phys(pages[idx]) != base + idx * PAGE_SIZE)
+			return ~0ULL;
+	return (base + offset) | ndhal->ndhal_address_map.pci_host_base;
+}
+
+/*
+ * Pin the caller's [va, va + size) for long-term DMA and track it per process; when
+ * pa_out is set it gets ndma_check_pages_contiguous()'s result. Returns 0 or -errno.
+ */
+int ndma_pin_host_memory(u64 va, u64 size, u64 *pa_out)
+{
+	struct neuron_pinned_mem *entry;
+	struct neuron_pinned_mem_process *proc;
+	unsigned long offset = va & (PAGE_SIZE - 1);
+	unsigned long nr_pages;
+	u64 span, pa;
+	int ret;
+	long pinned;
+
+	/* va/size come from userspace: a wrapping range would undersize the page array. */
+	if (va == 0 || size == 0 || va + size < va)
+		return -EINVAL;
+	span = ((va + size - 1) >> PAGE_SHIFT) - (va >> PAGE_SHIFT) + 1;
+	if (span > INT_MAX) /* pin_user_pages_fast() takes an int page count */
+		return -EINVAL;
+	nr_pages = span;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!entry->pages) {
+		ret = -ENOMEM;
+		goto err_free_entry;
+	}
+
+	/* Try fast path first - doesn't require mmap_lock */
+	pinned = pin_user_pages_fast(va & PAGE_MASK, nr_pages, FOLL_WRITE | FOLL_LONGTERM, entry->pages);
+	if (pinned < 0 || pinned < nr_pages) {
+		/* Fast path failed or incomplete - fall back to slow path */
+		if (pinned > 0)
+			unpin_user_pages(entry->pages, pinned);
+		/* Slow path with mmap_lock */
+		mmap_read_lock(current->mm);
+#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6)))
+		pinned = pin_user_pages(va & PAGE_MASK, nr_pages, FOLL_WRITE | FOLL_LONGTERM, entry->pages);
+#else
+		pinned = pin_user_pages(va & PAGE_MASK, nr_pages, FOLL_WRITE | FOLL_LONGTERM, entry->pages, NULL);
+#endif
+		mmap_read_unlock(current->mm);
+		if (pinned < 0) {
+			pr_err("failed to pin pages: %ld\n", pinned);
+			ret = pinned;
+			goto err_free_pages;
+		}
+		if (pinned < nr_pages) {
+			pr_err("could not pin all pages: %ld/%lu\n", pinned, nr_pages);
+			unpin_user_pages(entry->pages, pinned);
+			ret = -EFAULT;
+			goto err_free_pages;
+		}
+	}
+
+	entry->va = va;
+	entry->size = size;
+	entry->nr_pages = nr_pages;
+	RB_CLEAR_NODE(&entry->rb_node);
+	/* Compute the PA while entry is still private; once inserted, a concurrent unpin may free it. */
+	pa = ndma_check_pages_contiguous(entry->pages, nr_pages, offset);
+	mutex_lock(&pinned_mem_htable_lock);
+	proc = ndma_pinned_mem_get_process_locked(task_tgid_nr(current));
+	mutex_unlock(&pinned_mem_htable_lock);
+	if (!proc) {
+		ret = -ENOMEM;
+		goto err_unpin;
+	}
+	/* The ref taken by the lookup keeps proc alive across the insert; it is dropped right after. */
+	mutex_lock(&proc->lock);
+	ret = ndma_pinned_mem_insert_locked(&proc->root, entry);
+	mutex_unlock(&proc->lock);
+	kref_put(&proc->refcount, ndma_pinned_mem_process_release);
+	if (ret) {
+		pr_err("Failed to register, likely due to app failure to unpin previous mmap()\n");
+		goto err_unpin;
+	}
+
+	if (pa_out)
+		*pa_out = pa;
+	return 0;
+
+err_unpin:
+	unpin_user_pages(entry->pages, nr_pages);
+err_free_pages:
+	kvfree(entry->pages);
+err_free_entry:
+	kfree(entry);
+	return ret;
+}
+
+int ndma_unpin_host_memory(u64 va)
+{
+	struct neuron_pinned_mem_process *owner;
+	struct neuron_pinned_mem *region;
+
+	/* Look up the caller's record; the lookup returns it with a reference held. */
+	mutex_lock(&pinned_mem_htable_lock);
+	owner = ndma_pinned_mem_find_process_locked(task_tgid_nr(current));
+	mutex_unlock(&pinned_mem_htable_lock);
+	if (owner == NULL)
+		return -ENOENT;
+
+	/* Only a region whose start VA equals va is removed from the tree. */
+	mutex_lock(&owner->lock);
+	region = ndma_pinned_mem_find_exact_locked(&owner->root, va);
+	if (region != NULL)
+		rb_erase(&region->rb_node, &owner->root);
+	mutex_unlock(&owner->lock);
+	kref_put(&owner->refcount, ndma_pinned_mem_process_release);
+	if (region == NULL)
+		return -ENOENT;
+
+	/* The region is unlinked, so it can be unpinned and freed without the lock. */
+	ndma_pinned_mem_free_entry(region);
+	return 0;
+}
+
+/* Used by zero-copy API to use pinned pages instead on pinning on demand
+ * the copy can run either as part of IOCTL or in async thread, it takes PID
+ * of the process that pinned the pages.
+ */
+static bool ndma_pinned_mem_try_populate(pid_t pid, u64 va, u64 size, struct page **page_list, int nr_pages, struct neuron_pinned_mem_process **prepin_proc)
+{
+	struct neuron_pinned_mem_process *owner;
+	struct neuron_pinned_mem *region;
+	unsigned long first_page;
+	int idx;
+
+	*prepin_proc = NULL;
+
+	/* The lookup hands back the record with a reference already taken. */
+	mutex_lock(&pinned_mem_htable_lock);
+	owner = ndma_pinned_mem_find_process_locked(pid);
+	mutex_unlock(&pinned_mem_htable_lock);
+	if (owner == NULL)
+		return false;
+
+	mutex_lock(&owner->lock);
+	region = ndma_pinned_mem_find_containing_locked(&owner->root, va, size);
+	if (region != NULL) {
+		/* Index of the request's first page inside the region's page array. */
+		first_page = ((va & PAGE_MASK) - (region->va & PAGE_MASK)) >> PAGE_SHIFT;
+		for (idx = 0; idx < nr_pages; idx++)
+			page_list[idx] = region->pages[first_page + idx];
+	}
+	mutex_unlock(&owner->lock);
+
+	if (region == NULL) {
+		/* Nothing was copied, so the reference from the lookup is not needed. */
+		kref_put(&owner->refcount, ndma_pinned_mem_process_release);
+		return false;
+	}
+
+	/* Hit: the reference is passed on to the caller through *prepin_proc. */
+	*prepin_proc = owner;
+	return true;
+}
+
+void ndma_pinned_mem_cleanup_process(pid_t pid)
+{
+	struct neuron_pinned_mem_process *gone;
+
+	mutex_lock(&pinned_mem_htable_lock);
+	gone = ndma_pinned_mem_find_process_locked(pid);
+	if (gone != NULL)
+		hash_del(&gone->hash_node); /* unhashed: later lookups no longer find it */
+	mutex_unlock(&pinned_mem_htable_lock);
+
+	if (gone == NULL)
+		return;
+	/* First put balances the reference taken by the lookup just above. */
+	kref_put(&gone->refcount, ndma_pinned_mem_process_release);
+	/* Second put drops the hash table's reference; the final put frees the record and its regions. */
+	kref_put(&gone->refcount, ndma_pinned_mem_process_release);
+}
diff --git a/neuron_dma.h b/neuron_dma.h
index eeea14f..0661a49 100644
--- a/neuron_dma.h
+++ b/neuron_dma.h
@@ -262,4 +262,45 @@ int ndma_zerocopy_submit(struct neuron_device *nd,
 						bool direction,
 						u64 sequence_num);
 
+/**
+ * Pre-pinned host memory support
+ *
+ * Allows userspace to pin host memory once and reuse it for multiple
+ * DMA transfers without the overhead of pinning/unpinning on each transfer.
+ * Uses VA as the lookup key - zerocopy operations auto-detect pinned memory.
+ */
+
+/**
+ * ndma_pinned_mem_destroy() - Cleanup pinned memory tracking subsystem
+ */
+void ndma_pinned_mem_destroy(void);
+
+/**
+ * ndma_pin_host_memory() - Pin host memory for accelerated DMA operations
+ * @va: User virtual address to pin
+ * @size: Size of memory to pin, in bytes
+ * @pa_out: Optional; gets the start PA if the pinned pages are contiguous, else ~0ULL
+ *
+ * Pins host memory so zerocopy operations auto-detect pinned regions and skip
+ * per-transfer pinning. Tries pin_user_pages_fast() first, then falls back to
+ * pin_user_pages() under mmap_lock if needed.
+ *
+ * Return: 0 on success, -EEXIST if the range overlaps a pinned region, negative errno on failure
+ */
+int ndma_pin_host_memory(u64 va, u64 size, u64 *pa_out);
+
+/**
+ * ndma_unpin_host_memory() - Unpin previously pinned host memory
+ * @va: VA that was used in ndma_pin_host_memory (exact match required)
+ *
+ * Return: 0 on success, -ENOENT if the calling process has no pinned region starting at @va
+ */
+int ndma_unpin_host_memory(u64 va);
+
+/**
+ * ndma_pinned_mem_cleanup_process() - Cleanup all pinned memory for a process
+ * @pid: Process ID to cleanup
+ */
+void ndma_pinned_mem_cleanup_process(pid_t pid);
+
 #endif
diff --git a/neuron_fw_io.c b/neuron_fw_io.c
index dbf9133..186c390 100644
--- a/neuron_fw_io.c
+++ b/neuron_fw_io.c
@@ -54,6 +54,21 @@ int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count)
 	return 0;
 }
 
+int fw_io_misc_ram_reg_read(void *bar0, u64 offset, u32 *val)
+{
+	void *reg;
+	if ((offset & 0x3) != 0) {
+		pr_err("invalid misc ram offset, needs to be 4 byte aligned\n");
+		return -EPROTO;
+	}
+	/* one register, read through the HAL's CSR array reader */
+	reg = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + offset;
+	if (ndhal->ndhal_fw_io.fw_io_read_csr_array(&reg, val, 1, true) == 0)
+		return 0;
+	pr_err("failed to read misc ram reg at offset 0x%llx\n", offset);
+	return -EIO;
+}
+
 int fw_io_hbm_uecc_repair_state_read(void *bar0, uint32_t *hbm_repair_state)
 {
 	int ret;
@@ -133,6 +148,19 @@ int fw_io_api_version_read(void * bar0, u32 *version)
 	return ret;
 }
 
+int fw_io_fw_build_read(void *bar0, u32 *fw_build)
+{
+	/* the build number is read from FW_IO_REG_FW_BUILD_OFFSET inside the misc RAM block */
+	void *reg = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_FW_BUILD_OFFSET;
+	int rc = ndhal->ndhal_fw_io.fw_io_read_csr_array(&reg, fw_build, 1, true);
+
+	if (rc == 0)
+		return 0;
+
+	pr_err("failed to get fw build from the device, ret = %d\n", rc);
+	return rc;
+}
+
 int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id)
 {
 	int ret;
@@ -385,7 +413,7 @@ int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u
 			goto done;
 		}
 		ctx->fw_io_err_count++;
-		pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id,
+		pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id,
 	       	ctx->response->response_hdr.hdr.error_code);
 		// if we get an unsupported command response, don't retry
 		if (ctx->response->response_hdr.hdr.error_code == FW_IO_UNKNOWN_COMMAND) {
@@ -413,13 +441,13 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re
 
 	ret = fw_io_api_version_read(ctx->bar0, &api_version_num);
 
-	if ((ret != 0) || (api_version_num < FW_IO_NEW_READLESS_READ_MIN_API_VERSION)) {
+	if ((ret != 0) || (api_version_num < ndhal->ndhal_fw_io.new_readless_read_min_api_version)) {
 		pr_info_once("Firmware version %d, using legacy Firmware/Runtime comm framework", api_version_num);
 		return -ENOTSUPP;
 	}
 
 	mutex_lock(&ctx->lock);
-
+	ret = -EIO;
 	u32 retry_count = (command_id < FW_IO_CMD_MAX) ? fw_io_cmd_retry_tbl[command_id] : FW_IO_RD_RETRY;
 	for (i=0; i < retry_count; i++){
 		if (++ctx->next_seq_num == 0)
@@ -463,6 +491,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re
 		if (trigger) {
 			if (command_id != FW_IO_CMD_POST_TO_CW)
 				pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id);
+			ret = -ETIMEDOUT;
 			continue;
 		}
 		
@@ -473,6 +502,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re
 		if (resp_header.hdr.sequence_number != ctx->next_seq_num) {
 			if (command_id != FW_IO_CMD_POST_TO_CW)
 				pr_err("seq: %u, cmd: %u seq mismatch\n", ctx->next_seq_num, command_id);
+			ret = -EPROTO;
 			continue;
 		}
 
@@ -496,8 +526,8 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re
 		}
 
 		ctx->fw_io_err_count++;
-		pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code);
-		ret = -1;
+		pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code);
+		ret = -EIO;
 		if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) {
 			break;
 		}
@@ -775,7 +805,7 @@ void fw_io_destroy(struct fw_io_ctx *ctx)
 	kfree(ctx);
 }
 
-static inline uint32_t uncorrectable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) {
+static inline uint32_t unrepairable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) {
 	// API Version<6:  bitfield[15:0] Uncorrectable Errors
 	// API Version>=6: bitfield[15:12] Uncorrectable Errors
 	return (api_version >= 6) ? ((ecc_err_count >> 12) & 0xf) : (ecc_err_count & 0xffff);
@@ -787,8 +817,8 @@ static inline uint32_t repairable_ecc_err_count(uint32_t api_version, uint32_t e
 	return (api_version >= 6) ? (ecc_err_count & 0xfff) : 0;
 }
 
-void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t *repairable_ecc_count) {
-	uint32_t total_uncorrected_ecc_err_count = 0;
+void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *unrepairable_ecc_count, uint32_t *repairable_ecc_count) {
+	uint32_t total_unrepairable_ecc_err_count = 0;
 	uint32_t total_repairable_ecc_err_count = 0;
 	uint32_t channel = 0;
 	uint32_t ecc_err_count = 0;
@@ -804,11 +834,11 @@ void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t
 		if (ret) {
 			pr_err("sysfs failed to read ECC HBM%u error from FWIO\n", channel);
 		} else if (ecc_err_count != 0xdeadbeef) {
-			total_uncorrected_ecc_err_count += uncorrectable_ecc_err_count(api_version, ecc_err_count);
+			total_unrepairable_ecc_err_count += unrepairable_ecc_err_count(api_version, ecc_err_count);
 			total_repairable_ecc_err_count += repairable_ecc_err_count(api_version, ecc_err_count);
 		}
 	}
-	*ue_ecc_count = total_uncorrected_ecc_err_count;
+	*unrepairable_ecc_count = total_unrepairable_ecc_err_count;
 	*repairable_ecc_count = total_repairable_ecc_err_count;
 }
 
diff --git a/neuron_fw_io.h b/neuron_fw_io.h
index 83a5709..ac91b98 100644
--- a/neuron_fw_io.h
+++ b/neuron_fw_io.h
@@ -174,8 +174,8 @@ enum {
 	// All devices will have the D0 offset.  Devices with two dice will also have the D1 offset.
 	FW_IO_REG_POWER_UTIL_D0_OFFSET = 0x54, // 21 * 4 bytes
 	FW_IO_REG_POWER_UTIL_D1_OFFSET = 0x58, // 22 * 4 bytes
-
 	FW_IO_REG_HBM_REPAIR_STATE_OFFSET = 0x64, // 25 * 4 bytes
+	FW_IO_REG_FW_BUILD_OFFSET = 0x74, // 29 * 4 bytes
 											  //
 
 	FW_IO_REG_RESERVATION_ID_HI = 0x80,	// 32 * 4 bytes
@@ -273,10 +273,6 @@ struct fw_io_ctx {
 // max number of registers can be read in single function call
 #define FW_IO_MAX_READLESS_READ_REGISTER_COUNT 100
 
-// Min Firmware API version for new readless read framework
-#define FW_IO_NEW_READLESS_READ_MIN_API_VERSION 7
-#define FW_IO_POWER_MIN_API_VERSION 3
-
 
 /**
  * fw_io_register_read_region - Read a BAR region
@@ -467,6 +463,14 @@ int fw_io_device_power_read(void *bar0, u32 *power, unsigned die);
  */
 int fw_io_api_version_read(void * bar0, u32 *version);
 
+/**
+ * fw_io_fw_build_read() - Read the firmware build number
+ * @param bar - from bar
+ * @param fw_build  - output firmware build number
+ * @return  0 on success.
+ */
+int fw_io_fw_build_read(void *bar0, u32 *fw_build);
+
 /**
  * fw_io_device_id_write() - Read device id
  * @param bar - to bar
@@ -497,6 +501,16 @@ u64 fw_io_get_err_count(struct fw_io_ctx *ctx);
  */
 int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count);
 
+/**
+ * fw_io_misc_ram_reg_read() - Read a single 32-bit misc RAM register by byte offset
+ *
+ * @param bar0: mapped BAR0 base
+ * @param offset: byte offset of the register within the misc RAM block (e.g., FW_IO_REG_*_OFFSET)
+ * @param val: output register value
+ * @return 0 on success
+ */
+int fw_io_misc_ram_reg_read(void *bar0, u64 offset, u32 *val);
+
 /**
  * fw_io_serial_number_read() - Read serial number
  * 
@@ -509,10 +523,10 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number);
 /**
  * fw_io_get_total_ecc_err_counts() - Get UE ecc error count
  * @param bar0: from bar
- * @param ue_ecc_count: Pointer to the ue counter
- * @param repairable_err_count: Pointer to the repairable counter
+ * @param unrepairable_ecc_count: Pointer to the unrepairable ue counter
+ * @param repairable_err_count: Pointer to the repairable ue counter
  */
-void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t *repairable_ecc_count);
+void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *unrepairable_ecc_count, uint32_t *repairable_ecc_count);
 
 /**
  * fw_io_hbm_uecc_repair_state_read() - Get HBM UE ecc repair state
diff --git a/neuron_ioctl.h b/neuron_ioctl.h
index b20170b..fc0c342 100644
--- a/neuron_ioctl.h
+++ b/neuron_ioctl.h
@@ -61,6 +61,24 @@ struct neuron_ioctl_mem_alloc_v2_mem_type64 {
 	__u32 pad;  // [dummy] used to descriminate between ioctl version
 };
 
+/*
+ * Extension of neuron_ioctl_mem_alloc_v2_mem_type64 that also returns PA.
+ * The driver detects this variant by its larger _IOC_SIZE, saving a
+ * separate NEURON_IOCTL_MEM_GET_PA ioctl call.
+ */
+struct neuron_ioctl_mem_alloc_v2_mem_type64_pa {
+	__u64 size; // [in] Allocation size
+	__u64 align; // [in] Alignment
+	__u32 host_memory; // [in] If true allocates from host memory; else allocates from device memory
+	__u32 dram_channel; // [in] DRAM channel in device memory
+	__u32 dram_region; // [in] DRAM region in device memory
+	__u32 nc_id; // [in] NeuronCore id (valid only if location is device)
+	__u32 mem_type; // [in] Type of allocation
+	__u64 *mem_handle; // [out] The allocated memory handle is stored here.
+	__u32 pad;  // [dummy] Used to discriminate between ioctl versions
+	__u64 pa;   // [out] Physical address of the allocated memory
+};
+
 struct neuron_ioctl_device_init {
 	/* Splits DRAM in the device into smaller regions.
 	 * This improves performance of DDR by allowing parallel DMA using different regions.
@@ -265,7 +283,7 @@ struct neuron_ioctl_dma_queue_init {
 	__u64 tx_handle; // [in] mem handle for the tx ring
 	__u64 rx_handle; // [in] mem handle for the rx ring
 	__u64 rxc_handle; // [in] mem handle for the rxc ring
-	__u32 axi_port; // [in] axi port
+	__u32 axi_port_unused; // unused
 };
 
 #define MAX_DMA_QUEUE_INIT_BATCH 256
@@ -550,7 +568,7 @@ struct neuron_ioctl_host_device_id_to_rid_map {
 struct neuron_ioctl_hbm_scrub_start {
 	__u32 nc_id;
 	__u32 hbm_index;
-	__u32 axi_port;
+	__u32 axi_port_unused;
 	__u32 init_val;
 };
 
@@ -652,6 +670,20 @@ struct neuron_ioctl_get_async_h2t_dma_compl_queues {
 	} compl_queue_info[16];
 };
 
+/**
+ * Pre-pinned host memory support
+ * Allows pinning host memory once and reusing for multiple DMA transfers.
+ * Uses VA as the lookup key - no handles exposed to userspace.
+ */
+struct neuron_ioctl_host_mem_pin {
+	__u64 va;			// [in] User virtual address to pin
+	__u64 size;			// [in] Size of memory to pin
+	__u64 pa;			// [out] Physical address if contiguous, ~0ULL if not
+};
+
+struct neuron_ioctl_host_mem_unpin {
+	__u64 va;			// [in] VA to unpin (must match exact VA from pin)
+};
 
 #define NEURON_IOCTL_BASE 'N'
 
@@ -785,6 +817,7 @@ struct neuron_ioctl_get_async_h2t_dma_compl_queues {
 #define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type
 #define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type
 #define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad
+#define NEURON_IOCTL_MEM_ALLOC_V2MT64_PA _IOWR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64_pa) // V2MT64 + pa output
 
 /** Resets the requested NC (-1 for full device) */
 #define NEURON_IOCTL_NC_RESET _IOR(NEURON_IOCTL_BASE, 103, struct neuron_ioctl_device_reset *)
@@ -873,4 +906,8 @@ struct neuron_ioctl_get_async_h2t_dma_compl_queues {
 
 #define NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES _IOWR(NEURON_IOCTL_BASE, 135, struct neuron_ioctl_get_async_h2t_dma_compl_queues)
 
+/** Pre-pinned host memory operations - zerocopy will auto-detect pinned memory */
+#define NEURON_IOCTL_HOST_MEM_PIN _IOWR(NEURON_IOCTL_BASE, 136, struct neuron_ioctl_host_mem_pin)
+#define NEURON_IOCTL_HOST_MEM_UNPIN _IOW(NEURON_IOCTL_BASE, 137, struct neuron_ioctl_host_mem_unpin)
+
 #endif
diff --git a/neuron_metrics.c b/neuron_metrics.c
index 65185fb..a3a8914 100644
--- a/neuron_metrics.c
+++ b/neuron_metrics.c
@@ -19,6 +19,7 @@
 #include "neuron_device.h"
 #include "neuron_dhal.h"
 #include "neuron_power.h"
+#include "neuron_sysfs_metrics.h"
 
 unsigned int nmetric_metric_post_delay = 150000; // milliseconds
 unsigned int nmetric_metric_sample_delay = 50; // milliseconds.
@@ -1058,6 +1059,8 @@ static int nmetric_thread_fn(void *arg)
 	u64 last_metric_post_time;
 	u64 start_jiffies = jiffies;
 	u64 current_slow_tick;
+	u64 last_health_tick_jiffies = jiffies;
+	const u64 health_tick_interval_jiffies = msecs_to_jiffies(60 * 1000); // health_status cache refresh cadence
 	u8 tick_budget = 0; // how many ticks can be posted in a certain iteration of the loop
 
 	// initialize all aggregation buffers
@@ -1075,9 +1078,6 @@ static int nmetric_thread_fn(void *arg)
 	post_delay_in_jiffies = msecs_to_jiffies(nmetric_metric_post_delay);
 	last_metric_post_time = jiffies;
 
-	pr_info("Starting metrics thread, sample_delay_in_jiffies is %llu, post delay in ms is %u, timer rate = %d, \n",
-		sample_delay_in_jiffies, nmetric_metric_post_delay, HZ);
-
 	// metrics are only sent once at rate specified by module param, new metric data may be saved without being immediately sent
 	while (!kthread_should_stop() && nd->metrics.neuron_aggregation.state != NMETRIC_STATE_STOPPED) {
 		long wait_return;
@@ -1098,6 +1098,12 @@ static int nmetric_thread_fn(void *arg)
 		// There are some metrics that we sample at a relatively higher frequency.  Do that here.
 		nmetric_sample_high_freq(nd);
 
+		// Refresh health_status cached sysfs values 
+		if ((jiffies - last_health_tick_jiffies) >= health_tick_interval_jiffies) {
+			nsysfsmetric_health_status_tick(nd);
+			last_health_tick_jiffies = jiffies;
+		}
+
 		// For the slower metrics, we want to log once every post_delay_in_jiffies jiffies.
 		// We track this by keeping track of the number of intervals since this thread started
 		// up so that we don't introduce drift due to the latency of other loop operations.
diff --git a/neuron_mmap.c b/neuron_mmap.c
index 6a7dda8..a4faf63 100644
--- a/neuron_mmap.c
+++ b/neuron_mmap.c
@@ -8,7 +8,9 @@
 
 #include <linux/capability.h>
 #include <linux/fault-inject.h>
+#include <linux/mman.h>
 #include "neuron_mmap.h"
+#include "neuron_p2p.h"
 #include "neuron_pci.h"
 #include "neuron_device.h"
 #include "neuron_dhal.h"
@@ -279,8 +281,10 @@ static struct mem_chunk *nmmap_get_mc(struct neuron_device *nd, struct vm_area_s
 	 * memchunk boundaries.
 	*/
 	if (mc->size != size && mc->alloc_type != NEURON_MEMALLOC_TYPE_CONTIGUOUS_SCRATCHPAD_DEVICE) {
-		pr_err("nd%d: partial mmap of mc not supported(%llx != %llx)\n", nd->device_index,
-		       mc->size, size);
+		if (nmap_dm_special_resource_addr_valid(offset, size, NULL, NULL, NULL)) {
+			pr_err("nd%d: partial mmap of mc not supported(%llx != %llx)\n", nd->device_index,
+					mc->size, size);
+		}
 		return NULL;
 	} else if (mc->alloc_type == NEURON_MEMALLOC_TYPE_CONTIGUOUS_SCRATCHPAD_DEVICE) {
 		if (mc->pa + size > mc->mp->main_pool_end_addr) {
@@ -308,6 +312,7 @@ static const struct vm_operations_struct nmmap_dm_vm_ops = {
 
 static int nmmap_dm(struct neuron_device *nd, struct vm_area_struct *vma, u64 *bar4_offset)
 {
+	int ret;
 	u64 start, size, offset;
 
 	if (!nd->npdev.bar4_pa) {
@@ -317,7 +322,11 @@ static int nmmap_dm(struct neuron_device *nd, struct vm_area_struct *vma, u64 *b
 
 	start = vma->vm_pgoff << PAGE_SHIFT;
 	size = vma->vm_end - vma->vm_start;
-	ndhal->ndhal_mmap.mmap_get_bar4_offset(start, size, &offset);
+	ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(start, size, &offset);
+	if (unlikely(ret)) {
+		pr_err("Failed to map address 0x%llx to BAR4\n", start);
+		return ret;
+	}
 
 	if (bar4_offset)
 		*bar4_offset = offset;
@@ -509,4 +518,28 @@ int nmmap_get_va_placement(void *va, int *device_index, int *hbm_index)
 	return -ENXIO;
 }
 
+/**
+ * nmmap_get_unmapped_area() - Choose the VA for a device mmap. The result is huge page (2MB) aligned
+ * when offset and len are both huge page aligned; in every other case the kernel's own pick is returned.
+ */
+unsigned long nmmap_get_unmapped_area(struct file *filep, unsigned long addr,
+				      unsigned long len, unsigned long pgoff,
+				      unsigned long flags)
+{
+	unsigned long offset = pgoff << PAGE_SHIFT;
+	unsigned long aligned;
+
+	if ((flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) || // caller dictates the address
+	    (!IS_ALIGNED(offset, NEURON_P2P_HUGE_PAGE_SZ)) ||
+	    (!IS_ALIGNED(len, NEURON_P2P_HUGE_PAGE_SZ)) ||
+	    (len == 0 || len > ULONG_MAX - NEURON_P2P_HUGE_PAGE_SZ)) { // keeps the padded length below from wrapping
+		return nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags);
+	}
 
+	aligned = nmmap_kern_get_unmapped_area(filep, addr, len + NEURON_P2P_HUGE_PAGE_SZ, pgoff, flags); // over-ask by one huge page
+	if (IS_ERR_VALUE(aligned)) {
+		return nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags); // padded search failed; retry with the exact length
+	}
+
+	return ALIGN(aligned, NEURON_P2P_HUGE_PAGE_SZ); // round up; the extra huge page leaves room for len after rounding
+}
diff --git a/neuron_mmap.h b/neuron_mmap.h
index 190e753..a0cb3e3 100644
--- a/neuron_mmap.h
+++ b/neuron_mmap.h
@@ -18,6 +18,18 @@
 #define RHEL_RELEASE_VERSION(a,b) 1
 #endif
 
+/*
+ * nmmap_kern_get_unmapped_area(): Linux 6.10 removed get_unmapped_area from mm_struct and replaced
+ * it with the standalone mm_get_unmapped_area(); RHEL >= 9.5 is treated as having the new API too.
+ */
+#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5)))
+#define nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags) \
+	mm_get_unmapped_area(current->mm, filep, addr, len, pgoff, flags)
+#else
+#define nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags) \
+	current->mm->get_unmapped_area(filep, addr, len, pgoff, flags)
+#endif
+
 #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 5)))
 static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags)
 {
@@ -148,4 +160,21 @@ struct mem_chunk *nmmap_get_mc_from_pa(struct neuron_device *nd, phys_addr_t pa)
  */
 
 int nmmap_get_va_placement(void *va, int *device_index, int *hbm_index);
+
+/**
+ * nmmap_get_unmapped_area() - Return a huge page aligned VA for device mmaps
+ *  whose offset and size are both huge page aligned. This enables EFA P2P MR
+ *  registration to use 2MB pages instead of 4KB pages.
+ *
+ * @filep:	file pointer
+ * @addr:	address hint from userspace
+ * @len:	mapping length
+ * @pgoff:	page offset (device BAR offset)
+ * @flags:	mmap flags
+ *
+ * Return: unmapped area address, or error value
+ */
+unsigned long nmmap_get_unmapped_area(struct file *filep, unsigned long addr,
+				      unsigned long len, unsigned long pgoff,
+				      unsigned long flags);
 #endif
diff --git a/neuron_module.c b/neuron_module.c
index 56713ba..f397aa9 100644
--- a/neuron_module.c
+++ b/neuron_module.c
@@ -17,14 +17,16 @@
 #include "neuron_trace.h"
 #include "neuron_cdev.h"
 #include "neuron_pci.h"
+#include "neuron_dma.h"
+#include "neuron_test.h"
 
-MODULE_DESCRIPTION("Neuron Driver, built from SHA: 1c7ed9bd14936635773b5a01777882804ee8ea6e");
+MODULE_DESCRIPTION("Neuron Driver, built from SHA: 38e99b1fb8df603ee4109868c6d949e30f2c32c8");
 MODULE_LICENSE("GPL");
-MODULE_VERSION("2.27.4.0");
+MODULE_VERSION("2.28.0.0");
 MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*");
 
-const char driver_version[] = "2.27.4.0";
-const char driver_revision[] = "1c7ed9bd14936635773b5a01777882804ee8ea6e";
+const char driver_version[] = "2.28.0.0";
+const char driver_revision[] = "38e99b1fb8df603ee4109868c6d949e30f2c32c8";
 
 #ifdef CONFIG_FAULT_INJECTION
 
@@ -75,6 +77,7 @@ static int __init neuron_module_init(void)
 #ifdef CONFIG_FAULT_INJECTION
 	neuron_module_init_debugfs();
 #endif
+	ntest_init();
 
 	ret = ncdev_module_init();
 	if (ret)
@@ -92,6 +95,7 @@ static void __exit neuron_module_exit(void)
 #ifdef CONFIG_FAULT_INJECTION
 	neuron_module_free_debugfs();
 #endif
+	ndma_pinned_mem_destroy();
 	neuron_pci_module_exit();
 	ncdev_module_exit();
 }
diff --git a/neuron_p2p.c b/neuron_p2p.c
index be76220..99e3c3f 100644
--- a/neuron_p2p.c
+++ b/neuron_p2p.c
@@ -17,9 +17,6 @@
 #include "neuron_p2p.h"
 #include "neuron_pci.h"
 
-#define NEURON_P2P_HUGE_PAGE_SZ 0x200000
-#define NEURON_P2P_HUGE_PAGE_SZ_USAGE_THRESHOLD 0x10000000
-
 /*
  * Registers the VA with the callback and also returns the PA
  */
diff --git a/neuron_p2p.h b/neuron_p2p.h
index ce91df9..915886c 100644
--- a/neuron_p2p.h
+++ b/neuron_p2p.h
@@ -6,6 +6,9 @@
 #ifndef __NEURON_P2P_H__
 #define __NEURON_P2P_H__
 
+#define NEURON_P2P_HUGE_PAGE_SZ 0x200000
+#define NEURON_P2P_HUGE_PAGE_SZ_USAGE_THRESHOLD 0x10000000
+
 struct neuron_p2p_page_info {
     u64 physical_address; // PA's that map to the VA (page aligned as defined in va_info)
     u32 page_count; // page count each page is shift_page_size size
diff --git a/neuron_pci.c b/neuron_pci.c
index f385b3d..353fa8e 100644
--- a/neuron_pci.c
+++ b/neuron_pci.c
@@ -199,6 +199,7 @@ static int neuron_pci_device_close(struct neuron_device *nd)
 		fw_io_destroy((struct fw_io_ctx *)nd->fw_io_ctx);
 
 	nd->fw_io_ctx = NULL;
+	mutex_destroy(&nd->lock);
 	return 0;
 }
 
@@ -359,6 +360,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 		pci_info(dev, "Can't allocate memory for neuron_device\n");
 		goto fail_alloc_nd_mem;
 	}
+	mutex_init(&nd->lock);
 
     nmetric_init_driver_metrics(nd);
 
@@ -487,6 +489,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	pci_disable_device(dev);
 fail_dhal_init:
 fail_enable:
+	mutex_destroy(&nd->lock);
 	neuron_log_destroy( nd);
 	kvfree(nd);
 fail_alloc_nd_mem:
@@ -502,6 +505,8 @@ static void neuron_pci_remove(struct pci_dev *dev)
 	if (nd == NULL)
 		return;
 
+	ndma_h2d_stop_cmpltn_thread(nd);
+
 	nr_stop_thread(nd);
 
 	nmetric_stop_thread(nd);
diff --git a/neuron_reset.c b/neuron_reset.c
index ff7b3a6..c8a95b4 100644
--- a/neuron_reset.c
+++ b/neuron_reset.c
@@ -20,6 +20,7 @@
 #include "neuron_fw_io.h"
 #include "neuron_dhal.h"
 #include "neuron_nq.h"
+#include "neuron_test.h"
 
 int no_reset = 0;
 module_param(no_reset, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
@@ -116,6 +117,8 @@ static int nr_reset_thread_fn(void *arg)
 																		 (nc_map == NEURON_NC_MAP_DEVICE) ? "device" : "TPB", 
 																 		 request_iter->request_id);)
 
+		ndmar_close_ncs(nd, nc_map);
+
 		ret = ndhal->ndhal_reset.nr_initiate_reset(nd, nc_map);
 		if (ret) {
 			char *reason = (ret == -EINTR) ? "interrupted by driver unload\n" : "failed\n";
@@ -128,7 +131,7 @@ static int nr_reset_thread_fn(void *arg)
 			// If the reset was successfully initiated the 
 			// response we get back is a pass/fail and we don't need to retry.
 			ret = ndhal->ndhal_reset.nr_wait_for_reset_completion(nd);
-			if (ret) {
+			if (ret  || _ntest_trigger(NEURON_TEST_TRIGGER_RST_FAILURE, nd->device_index)) {
 				nr_call_post_reset_config(nd, nc_map, false);
 				ITER_COAL_REQS(request_iter, first_request, last_request,
 					pr_info("nd%d: reset request %u was initiated, but failed to complete\n", nd->device_index, request_iter->request_id);)
@@ -257,9 +260,12 @@ int nr_start_ncs(struct neuron_device *nd, uint32_t nc_map, uint32_t request_id)
 		// perform the driver's reset related activities, then return so
 		// that outside of not resetting HW, everything  else will look natural.
 		//
-		ndmar_init_ncs(nd, NEURON_NC_MAP_DEVICE);
+		ndmar_close_ncs(nd, nc_map);
+		ndmar_init_ncs(nd, nc_map);
 		nr_call_post_reset_config(nd, nc_map, true);
-		nd->device_state = NEURON_DEVICE_STATE_READY;
+		if (request_id == NEURON_RESET_REQUEST_ALL) {
+			nd->device_state = NEURON_DEVICE_STATE_READY;
+		}
 		return 0;
 	}
 
diff --git a/neuron_ring.c b/neuron_ring.c
index 280e961..be7e868 100644
--- a/neuron_ring.c
+++ b/neuron_ring.c
@@ -85,11 +85,9 @@ u32 ndmar_ring_get_desc_count(u32 v)
  * @eng: dma engine
  * @qid: dma queue id in the engine for which the mc is being set.
  * @mc: backing memory chunk
- * @port: which axi port(0 or 1) to access the DRAM(for performance)
  * @queue_type: type of the queue(rx, tx, or completion)
  */
-static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_chunk *mc, u32 port,
-				     enum neuron_dma_queue_type queue_type)
+static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_chunk *mc, enum neuron_dma_queue_type queue_type)
 {
 	struct ndma_queue *queue = ndmar_get_queue(eng, qid);
 	struct ndma_ring *ring = ndmar_get_ring(queue);
@@ -102,9 +100,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c
 			ring->tx.addr = virt_to_phys(ring->tx.ptr) | ndhal->ndhal_address_map.pci_host_base;
 		} else {
 			ring->tx.addr = mc->pa;
-			if (port) {
-				ring->tx.addr |= ndhal->ndhal_address_map.port_1_base;
-			}
 		}
 		break;
 	case NEURON_DMA_QUEUE_TYPE_RX:
@@ -114,9 +109,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c
 			ring->rx.addr = virt_to_phys(ring->rx.ptr) | ndhal->ndhal_address_map.pci_host_base;
 		} else {
 			ring->rx.addr = mc->pa;
-			if (port) {
-				ring->rx.addr |= ndhal->ndhal_address_map.port_1_base;
-			}
 		}
 		break;
 	case NEURON_DMA_QUEUE_TYPE_COMPLETION:
@@ -127,9 +119,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c
 			ring->rxc.addr = virt_to_phys(ring->rxc.ptr) | ndhal->ndhal_address_map.pci_host_base;
 		} else {
 			ring->rxc.addr = mc->pa;
-			if (port) {
-				ring->rxc.addr |= ndhal->ndhal_address_map.port_1_base;
-			}
 		}
 		break;
 	default:
@@ -139,7 +128,7 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c
 
 int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count,
 		     u32 rx_desc_count, struct mem_chunk *tx_mc, struct mem_chunk *rx_mc,
-		     struct mem_chunk *rxc_mc, u32 port, bool allocatable)
+		     struct mem_chunk *rxc_mc, bool allocatable)
 {
 	int ret = -1;
 	struct ndma_eng *eng;
@@ -171,8 +160,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_
 	ring->qid = qid;
 	ring->h2t_completion_mc = NULL;
 
-	trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc,
-			     port);
+	trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc);
 
 	if (tx_mc) {
 		/*
@@ -180,7 +168,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_
 			ret = -EINVAL;
 			goto done;
 		}*/
-		ndmar_ring_set_mem_chunk(eng, qid, tx_mc, port, NEURON_DMA_QUEUE_TYPE_TX);
+		ndmar_ring_set_mem_chunk(eng, qid, tx_mc, NEURON_DMA_QUEUE_TYPE_TX);
 	}
 
 	if (rx_mc) {
@@ -189,7 +177,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_
 			ret = -EINVAL;
 			goto done;
 		}*/
-		ndmar_ring_set_mem_chunk(eng, qid, rx_mc, port, NEURON_DMA_QUEUE_TYPE_RX);
+		ndmar_ring_set_mem_chunk(eng, qid, rx_mc, NEURON_DMA_QUEUE_TYPE_RX);
 	}
 
 	if (rxc_mc) {
@@ -197,7 +185,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_
 			ret = -EINVAL;
 			goto done;
 		}
-		ndmar_ring_set_mem_chunk(eng, qid, rxc_mc, port, NEURON_DMA_QUEUE_TYPE_COMPLETION);
+		ndmar_ring_set_mem_chunk(eng, qid, rxc_mc, NEURON_DMA_QUEUE_TYPE_COMPLETION);
 	}
 
 	ret = udma_m2m_init_queue(&eng->udma, qid, eng_id, tx_desc_count, rx_desc_count, allocatable, tx_mc != NULL ? &ring->tx : NULL,
@@ -261,7 +249,7 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid)
 				continue;
 			}
 
-			ret = ndmar_queue_init(nd, eng_id, qid, desc_count, desc_count, mc, mc, NULL, 0, false);
+			ret = ndmar_queue_init(nd, eng_id, qid, desc_count, desc_count, mc, mc, NULL, false);
 			// ignore the error and continue to reset other queues.
 			if (ret)
 				pr_err("nd%d:dma%d:q%d failed to reset (%d)", nd->device_index, eng_id, qid, ret);
@@ -380,8 +368,8 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid)
 		goto error;
 	}
 
-	ndmar_ring_set_mem_chunk(eng, qid, tx_mc, 0, NEURON_DMA_QUEUE_TYPE_TX);
-	ndmar_ring_set_mem_chunk(eng, qid, rx_mc, 0, NEURON_DMA_QUEUE_TYPE_RX);
+	ndmar_ring_set_mem_chunk(eng, qid, tx_mc, NEURON_DMA_QUEUE_TYPE_TX);
+	ndmar_ring_set_mem_chunk(eng, qid, rx_mc, NEURON_DMA_QUEUE_TYPE_RX);
 
 	ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, sizeof(u32) * 2 * NEURON_DMA_H2T_CTX_HANDLE_CNT, 0, MEM_LOC_HOST, 0, 0, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_HOST, &h2t_completion_mc);
 	if (ret) {
diff --git a/neuron_ring.h b/neuron_ring.h
index c9d3462..e99ea40 100644
--- a/neuron_ring.h
+++ b/neuron_ring.h
@@ -6,6 +6,8 @@
 #ifndef NEURON_RING_H
 #define NEURON_RING_H
 
+#include <linux/atomic.h>
+
 #include "udma/udma.h"
 #include "share/neuron_driver_shared.h"
 
@@ -23,6 +25,29 @@ struct neuron_dma_queue_state;
 struct ndma_eng;
 struct ndma_ring;
 
+/*
+ * H2D DMA Completion Thread
+ * -------------------------
+ * one thread per ND.
+ * It is shared across rings for completion, remote pinning, and submission work.
+ *
+ * Async IO only.
+ *
+ * @thread: kthread handle
+ * @wait_queue: wait queue used to sleep/wake the thread
+ * @nonempty_ctxq_bitmap: bitmap of H2D ctx queues with pending work
+ * @stop: set to request thread exit
+ */
+struct ndma_h2d_dma_cmpltn_thread {
+	struct task_struct *thread;
+	wait_queue_head_t wait_queue;
+	atomic64_t nonempty_ctxq_bitmap;
+	volatile bool stop;
+};
+
+int ndma_h2d_create_cmpltn_thread(struct neuron_device *nd);
+void ndma_h2d_stop_cmpltn_thread(struct neuron_device *nd);
+
 /*
  * H2D DMA Completion Queue (CQ)
  * -----------------------------
@@ -284,14 +309,13 @@ int ndmar_eng_set_state(struct neuron_device *nd, int eng_id, u32 state);
  * @tx_mc: Memory chunk backing TX queue
  * @rx_mc: Memory chunk backing RX queue
  * @rxc_mc: Memory chunk backing RX completion queue
- * @port: AXI port.
  * @allocatable: whether new descriptors can be added post queue init
  *
  * Return: 0 if queue init succeeds, a negative error code otherwise.
  */
 int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count,
 		     u32 rx_desc_count, struct mem_chunk *tx_mc, struct mem_chunk *rx_mc,
-		     struct mem_chunk *rxc_mc, u32 port, bool allocatable);
+		     struct mem_chunk *rxc_mc, bool allocatable);
 
 /**
  * ndmar_queue_release() - Release a DMA queue.
@@ -426,7 +450,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid);
 /**
  * ndmar_h2t_ring_is_h2t() - return true if this is an h2t ring
  */
-static inline bool ndmar_h2t_ring_is_h2t(struct ndma_ring *ring)
+static inline bool ndmar_h2t_ring_is_h2t(const struct ndma_ring *ring)
 {
 	return (ring->h2t_completion_mc != NULL);
 }
diff --git a/neuron_sysfs_metrics.c b/neuron_sysfs_metrics.c
index fd71ae0..80a4947 100644
--- a/neuron_sysfs_metrics.c
+++ b/neuron_sysfs_metrics.c
@@ -12,6 +12,7 @@
 
 #include "neuron_device.h"
 #include "neuron_ds.h"
+#include "neuron_fw_io.h"
 #include "neuron_sysfs_metrics.h"
 #include "neuron_dhal.h"
 #include "neuron_power.h"
@@ -150,6 +151,31 @@ static const nsysfsmetric_attr_info_t ecc_attrs_info_tbl[] = {
 };
 static const int ecc_attrs_info_tbl_cnt = sizeof(ecc_attrs_info_tbl) / sizeof(nsysfsmetric_attr_info_t);
 
+struct health_status_reg_map {
+    enum health_status_cache_slot slot; // index into cached_health_regs[] where the value is stored
+    u64 offset; // register offset handed to fw_io_misc_ram_reg_read()
+    bool is_err_metric; // true: a change in this register raises a hw_error_event
+};
+
+static const struct health_status_reg_map health_status_reg_tbl[] = {
+    { HEALTH_STATUS_SLOT_SRAM_ECC,         FW_IO_REG_SRAM_ECC_OFFSET,           true },
+    { HEALTH_STATUS_SLOT_HBM0_ECC,         FW_IO_REG_HBM0_ECC_OFFSET,           true },
+    { HEALTH_STATUS_SLOT_HBM1_ECC,         FW_IO_REG_HBM1_ECC_OFFSET,           true },
+    { HEALTH_STATUS_SLOT_HBM2_ECC,         FW_IO_REG_HBM2_ECC_OFFSET,           true },
+    { HEALTH_STATUS_SLOT_HBM3_ECC,         FW_IO_REG_HBM3_ECC_OFFSET,           true },
+    { HEALTH_STATUS_SLOT_HBM_REPAIR_STATE, FW_IO_REG_HBM_REPAIR_STATE_OFFSET,   true },
+    { HEALTH_STATUS_SLOT_FW_API_VERSION,   FW_IO_REG_API_VERSION_OFFSET,        false }, // cached only; a change here is not an error event
+};
+static const int health_status_reg_tbl_cnt = sizeof(health_status_reg_tbl) / sizeof(health_status_reg_tbl[0]);
+
+static const nsysfsmetric_attr_info_t health_status_attrs_info_tbl[] = { // attributes created on the "health_status" node
+    ATTR_INFO("hbm_ecc_err_count",             NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HBM_UE_COUNT),             CACHED_VALUES),
+    ATTR_INFO("repairable_hbm_ecc_err_count",  NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT),  CACHED_VALUES),
+    ATTR_INFO("sram_ecc_err_count",            NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT),            CACHED_VALUES),
+    ATTR_INFO("hw_error_event",                NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT),           CACHED_VALUES),
+};
+static const int health_status_attrs_info_tbl_cnt = sizeof(health_status_attrs_info_tbl) / sizeof(nsysfsmetric_attr_info_t);
+
 static const nsysfsmetric_attr_info_t root_arch_node_attrs_info_tbl[] = {
     ATTR_INFO("arch_type", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_OTHER_NEURON_ARCH_TYPE), OTHER),
     ATTR_INFO("instance_type", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_OTHER_NEURON_INSTANCE_TYPE), OTHER),
@@ -399,6 +425,46 @@ static ssize_t nsysfsmetric_show_nrt_other_metrics(struct nsysfsmetric_metrics *
 	return len;
 }
 
+static ssize_t nsysfsmetric_show_cached_values_metrics(struct nsysfsmetric_metrics *sysfs_metrics,
+                                                       struct metric_attribute *attr,
+                                                       char *buf)
+{
+    u32 value = 0;
+
+    switch (attr->metric_id) { // every branch formats cached fields only; no register is read here
+        case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT):
+            value = READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_SRAM_ECC]);
+            return nsysfsmetric_sysfs_emit(buf, "%u\n", value & 0xffff); // Lower 16 bits
+        case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HBM_UE_COUNT):
+            // TODO: Use cached HEALTH_STATUS_SLOT_FW_API_VERSION
+            // For now, safe to assume api_version >= 6
+            value = 0;
+            value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM0_ECC])) >> 12) & 0xf); // bits [15:12] of each HBMn ECC register are summed
+            value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM1_ECC])) >> 12) & 0xf);
+            value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM2_ECC])) >> 12) & 0xf);
+            value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM3_ECC])) >> 12) & 0xf);
+            if ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM_REPAIR_STATE]) & 0x3) == 0x2) // repair state 0x2 adds one to this count
+                value +=1;
+
+            return nsysfsmetric_sysfs_emit(buf, "%u\n", value);
+        case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT):
+            value = 0;
+            value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM0_ECC])) & 0xfff); // bits [11:0] of each HBMn ECC register are summed
+            value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM1_ECC])) & 0xfff);
+            value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM2_ECC])) & 0xfff);
+            value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM3_ECC])) & 0xfff);
+            if ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM_REPAIR_STATE]) & 0x3) == 0x1) // repair state 0x1 adds one to this count
+                value +=1;
+
+            return nsysfsmetric_sysfs_emit(buf, "%u\n", value);
+        case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT):
+            return nsysfsmetric_sysfs_emit(buf, "%u\n", READ_ONCE(sysfs_metrics->hw_error_event_count));
+        default:
+            pr_err("cannot show sysfs metrics for metric_id=%d of attr_type CACHED_VALUES\n", attr->metric_id);
+            return 0; // unknown metric id: log and report zero bytes written
+    }
+}
+
 static ssize_t nsysfsmetric_set_nrt_total_metrics(struct nsysfsmetric_metrics *sysfs_metrics,
                                                     struct metric_attribute *attr,
                                                     const char *buf, size_t size)
@@ -504,6 +570,11 @@ static struct metric_attribute *nsysfsmetric_create_attr(const char *metric_name
             metric_attr->show = nsysfsmetric_show_nrt_other_metrics;
             metric_attr->store = nsysfsmetric_set_nrt_other_metrics;			
             break;
+        case CACHED_VALUES:
+            metric_attr->attr.mode = VERIFY_OCTAL_PERMISSIONS(S_IRUGO);
+            metric_attr->show = nsysfsmetric_show_cached_values_metrics;
+            metric_attr->store = NULL;
+            break;
         default:
             metric_attr->show = NULL;
             metric_attr->store = NULL;
@@ -934,6 +1005,14 @@ int nsysfsmetric_register(struct neuron_device *nd, struct kobject *neuron_devic
         return ret;
     }
 
+    // neuron{0, 1, ...}/stats/hardware/health_status/
+    if (ndhal->ndhal_sysfs_metrics.health_status_enabled && metrics->hardware_node) {
+        ret = nsysfsmetric_add_health_status_nodes(metrics, metrics->hardware_node);
+        if (ret) {
+            return ret;
+        }
+    }
+
     // neuron{0, 1, ...}/neuron_core{0, 1, ...}/
     ret = nsysfsmetric_init_and_add_nc_default_nodes(nd, &metrics->root);
     if (ret) {
@@ -963,6 +1042,8 @@ static void nsysfsmetric_destroy_counters(struct nsysfsmetric_metrics *metrics)
     memset(metrics->nrt_metrics, 0, sizeof(metrics->nrt_metrics));
     memset(metrics->nrt_nd_metrics, 0, sizeof(metrics->nrt_nd_metrics));
     memset(metrics->dev_metrics, 0, sizeof(metrics->dev_metrics));
+    metrics->hardware_node = NULL;
+    metrics->health_status_node = NULL;
 }
 
 static void nsysfsmetric_destroy_nodes(struct nsysfsmetric_node *node, bool acquire_lock)
@@ -997,6 +1078,53 @@ void nsysfsmetric_destroy(struct neuron_device *nd)
     mutex_unlock(&nd->sysfs_metrics.root.lock);
 }
 
+/*
+ * Reads each misc RAM register listed in health_status_reg_tbl and refreshes the cache exposed via
+ * stats/hardware/health_status/. If any register flagged is_err_metric changed, bumps hw_error_event_count
+ * once and issues sysfs_notify on "hw_error_event". Invoked periodically from the metrics thread.
+ */
+void nsysfsmetric_health_status_tick(struct neuron_device *nd)
+{
+    struct nsysfsmetric_metrics *metrics = &nd->sysfs_metrics;
+    int i;
+    bool changed = false;
+
+    if (!ndhal->ndhal_sysfs_metrics.health_status_enabled)
+        return;
+
+    for (i = 0; i < health_status_reg_tbl_cnt; i++) {
+        u32 val;
+        int ret = fw_io_misc_ram_reg_read(nd->npdev.bar0, health_status_reg_tbl[i].offset, &val);
+        if (ret)
+            continue; // TODO: figure out how to communicate to sysfs readers that read failed
+
+        if (val != READ_ONCE(metrics->cached_health_regs[health_status_reg_tbl[i].slot])) {
+            WRITE_ONCE(metrics->cached_health_regs[health_status_reg_tbl[i].slot], val);
+            if (health_status_reg_tbl[i].is_err_metric) { // non-error registers are cached but never raise an event
+                changed = true;
+            }
+        }
+    }
+
+    if (changed && metrics->health_status_node) { // at most one event per tick, however many registers changed
+        // This function is the only writer, don't need atomic update, just volatile (READ_ONCE/WRITE_ONCE)
+        WRITE_ONCE(metrics->hw_error_event_count, READ_ONCE(metrics->hw_error_event_count) + 1);
+        sysfs_notify(&metrics->health_status_node->kobj, NULL, "hw_error_event");
+    }
+}
+
+int nsysfsmetric_add_health_status_nodes(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *hardware_node)
+{
+    struct nsysfsmetric_node *node = nsysfsmetric_init_and_add_one_node(metrics, hardware_node,
+            "health_status", false, -1, health_status_attrs_info_tbl_cnt, health_status_attrs_info_tbl);
+    if (!node) {
+        pr_err("failed to add health_status node under stats/hardware\n");
+        return -1; // generic failure code, not an errno value
+    }
+    metrics->health_status_node = node; // kept as the sysfs_notify target for hw_error_event
+    return 0;
+}
+
 int nsysfsmetric_init_and_add_dynamic_counter_nodes(struct neuron_device *nd, uint64_t ds_val)
 {
     int ret = 0;
diff --git a/neuron_sysfs_metrics.h b/neuron_sysfs_metrics.h
index 27b30c8..9c2445c 100644
--- a/neuron_sysfs_metrics.h
+++ b/neuron_sysfs_metrics.h
@@ -26,6 +26,7 @@ enum nsysfsmetric_attr_type {
     PRESENT,   // counter value at the current window
     PEAK,      // max counter value
     OTHER,     // all other types besides TOTAL, PRESENT, and PEAK
+    CACHED_VALUES, // cached value updated out-of-band (e.g., by a polling thread)
 };
 
 enum nsysfsmetric_metric_id_category {
@@ -73,6 +74,10 @@ enum nsysfsmetric_non_nds_ids { // The metrics needed by sysfs metrics but not s
 	NON_NDS_OTHER_NOTIFY_DELAY,
 	NON_NDS_OTHER_SERIAL_NUMBER,
 	NON_NDS_OTHER_POWER_UTILIZATION,
+	NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT,
+	NON_NDS_HEALTH_STATUS_HBM_UE_COUNT,
+	NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT,
+	NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT,
 };
 
 struct neuron_device;
@@ -83,6 +88,19 @@ struct sysfs_mem_thread {
 	volatile bool stop; // if cleared, thread would exit the loop
 };
 
+// Cache slot identifiers for misc RAM registers whose values are exposed under
+// stats/hardware/health_status/. Add a new value (before HEALTH_STATUS_SLOT_COUNT) when caching a new register.
+enum health_status_cache_slot {
+	HEALTH_STATUS_SLOT_SRAM_ECC,
+	HEALTH_STATUS_SLOT_HBM0_ECC,
+	HEALTH_STATUS_SLOT_HBM1_ECC,
+	HEALTH_STATUS_SLOT_HBM2_ECC,
+	HEALTH_STATUS_SLOT_HBM3_ECC,
+	HEALTH_STATUS_SLOT_HBM_REPAIR_STATE,
+	HEALTH_STATUS_SLOT_FW_API_VERSION,
+	HEALTH_STATUS_SLOT_COUNT, // not a slot: number of slots, used to size the cache array
+};
+
 struct nsysfsmetric_counter {
     struct nsysfsmetric_node *node; // used for sysfs_notify
     u64 total;
@@ -107,6 +125,14 @@ struct nsysfsmetric_metrics { // per neuron_device
     // nc_id should be -1 to use nrt_nd_metrics, and should be a valid neuron core ID to use nrt_metrics
     struct nsysfsmetric_counter dev_metrics[MAX_METRIC_ID]; // TODO: the device metrics
     uint64_t bitmap; // store the dynamic metrics to be added
+
+    // Cached misc RAM register values for stats/hardware/health_status/.
+    // Updated periodically from the metrics thread; sysfs reads return cached values.
+    // Indexed by enum health_status_cache_slot.
+    u32 cached_health_regs[HEALTH_STATUS_SLOT_COUNT];
+    u32 hw_error_event_count;
+    struct nsysfsmetric_node *hardware_node;     // stats/hardware/; cached so health_status can attach under it
+    struct nsysfsmetric_node *health_status_node; // target for sysfs_notify on hw_error_event
 };
 
 typedef struct nsysfsmetric_attr_info {
@@ -220,5 +246,14 @@ void nsysfsmetric_set_counter(struct neuron_device *nd, int metric_id_category,
  */
 void nsysfsmetric_inc_reset_fail_count(struct neuron_device *nd);
 
+/**
+ * nsysfsmetric_add_health_status_nodes() - add stats/hardware/health_status/ subtree under the hardware node
+ */
+int nsysfsmetric_add_health_status_nodes(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *hardware_node);
+
+/**
+ * nsysfsmetric_health_status_tick() - perform a single health_status cache refresh; invoked from the metrics thread
+ */
+void nsysfsmetric_health_status_tick(struct neuron_device *nd);
 
 #endif
diff --git a/neuron_test.c b/neuron_test.c
new file mode 100644
index 0000000..8ed9b5f
--- /dev/null
+++ b/neuron_test.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2026, Amazon.com, Inc. or its affiliates. All Rights Reserved
+ */
+
+/** Neuron driver test module
+ *
+ *  the purpose of this module is to provide error injection functionality
+ *  for testing.  It should be lightweight, simple and have little to no
+ *  knowledge of the driver's operation.  It requires sysadmin caps
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/string.h>
+#include <linux/types.h> 
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/version.h>
+#include <linux/bitmap.h>
+#include "neuron_test.h"
+
+#define _NEURON_TT_AT_LOAD_VALBITS		16	// trigger value: low 16 bits of neuron_test_trigger_at_load
+#define _NEURON_TT_AT_LOAD_VALSHIFT		 0
+#define _NEURON_TT_AT_LOAD_VALMASK		((1 << _NEURON_TT_AT_LOAD_VALBITS)-1)
+#define _NEURON_TT_AT_LOAD_VAL(val)		(((val) >> _NEURON_TT_AT_LOAD_VALSHIFT) & _NEURON_TT_AT_LOAD_VALMASK)
+
+#define _NEURON_TT_AT_LOAD_DATABITS		16	// trigger data: bits 16..31 of neuron_test_trigger_at_load
+#define _NEURON_TT_AT_LOAD_DATASHIFT	16
+#define _NEURON_TT_AT_LOAD_DATAMASK		((1 << _NEURON_TT_AT_LOAD_DATABITS)-1)
+#define _NEURON_TT_AT_LOAD_DATA(data)	(((data) >> _NEURON_TT_AT_LOAD_DATASHIFT) & _NEURON_TT_AT_LOAD_DATAMASK)
+
+int neuron_test_trigger_ena = 0;	// exported via neuron_test.h; gates _ntest_trigger()
+static int neuron_test_trigger_at_load = 0;	// loadtime testing trigger (16 bits of trigger value, 16 bits trigger data)
+
+module_param(neuron_test_trigger_ena, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
+MODULE_PARM_DESC(neuron_test_trigger_ena, "test trigger enable");
+
+module_param(neuron_test_trigger_at_load, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
+MODULE_PARM_DESC(neuron_test_trigger_at_load, "test trigger at load time");
+
+static DECLARE_BITMAP(neuron_test_trigger_bitmap, NEURON_TEST_TRIGGER_MAX);	// one bit per armed trigger; file private
+
+// test trigger data, interpreted per trigger.  Right now just u64, but we could get more sophisticated
+// if needed.
+//
+static u64 neuron_test_trigger_data[NEURON_TEST_TRIGGER_MAX] = {0};
+
+/* ntest_init() - clear all test triggers, then arm the optional load time trigger */
+void ntest_init(void)
+{
+	int load_arg = neuron_test_trigger_at_load;
+	int trig = _NEURON_TT_AT_LOAD_VAL(load_arg);
+
+	bitmap_zero(neuron_test_trigger_bitmap, NEURON_TEST_TRIGGER_MAX);
+	// nothing more to do unless a non-zero, in-range load time trigger was supplied
+	if (!load_arg || trig >= NEURON_TEST_TRIGGER_MAX) {
+		return;
+	}
+	bitmap_set(neuron_test_trigger_bitmap, trig, 1);
+	neuron_test_trigger_data[trig] = _NEURON_TT_AT_LOAD_DATA(load_arg);
+}
+
+//inline int _ntest_trigger(enum neuron_test_trigger trigger, void * trigger_data) {}
+
+
+// ntest_trigger() - report whether an armed test trigger should fire.
+// Returns 1 when @trigger is armed and @trigger_data equals its stored data, else 0.
+int ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data)
+{
+	int fire = 0;
+
+	// trigger data is interpreted per trigger; reset failure uses an exact match
+	if (trigger == NEURON_TEST_TRIGGER_RST_FAILURE) {
+		fire = test_bit(NEURON_TEST_TRIGGER_RST_FAILURE, neuron_test_trigger_bitmap) &&
+			(trigger_data == neuron_test_trigger_data[NEURON_TEST_TRIGGER_RST_FAILURE]);
+	}
+
+	return fire;
+}
diff --git a/neuron_test.h b/neuron_test.h
new file mode 100644
index 0000000..51bdb31
--- /dev/null
+++ b/neuron_test.h
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2026, Amazon.com, Inc. or its affiliates. All Rights Reserved
+ */
+
+#ifndef NEURON_TEST_H
+#define NEURON_TEST_H
+
+#include <linux/types.h>
+
+enum neuron_test_trigger {	// ids accepted by ntest_trigger() / _ntest_trigger()
+	NEURON_TEST_TRIGGER_RST_FAILURE = 0,	// NOTE(review): presumably injects a device reset failure - confirm at call sites
+	NEURON_TEST_TRIGGER_MAX = 1,	// number of trigger ids, not a trigger itself
+};
+
+extern int neuron_test_trigger_ena;
+
+void ntest_init(void);
+
+int ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data);
+
+/*
+ * _ntest_trigger() - returns 0 unless neuron_test_trigger_ena is set, else the result of ntest_trigger()
+ */
+static inline int _ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data)
+{
+	return neuron_test_trigger_ena ? ntest_trigger(trigger, trigger_data) : 0;
+}
+#endif
diff --git a/neuron_trace.h b/neuron_trace.h
index e40cb8e..0782f2d 100644
--- a/neuron_trace.h
+++ b/neuron_trace.h
@@ -33,8 +33,8 @@ TRACE_EVENT(dma_engine_init,
 TRACE_EVENT(dma_queue_init,
 	TP_PROTO(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count,
 		 u32 rx_desc_count, struct mem_chunk *tx_mc,
-		 struct mem_chunk *rx_mc, struct mem_chunk *rxc_mc, u32 port),
-	TP_ARGS(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc, port),
+		 struct mem_chunk *rx_mc, struct mem_chunk *rxc_mc),
+	TP_ARGS(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc),
 	TP_STRUCT__entry(
 		__field(u32,	        device_index)
 		__field(u32,	        eng_id)
@@ -44,7 +44,6 @@ TRACE_EVENT(dma_queue_init,
 		__field(struct mem_chunk *,	        tx_mc)
 		__field(struct mem_chunk *,	        rx_mc)
 		__field(struct mem_chunk *,	        rxc_mc)
-		__field(u32,	        port)
 		),
 	TP_fast_assign(
 		__entry->device_index = nd->device_index;
@@ -55,9 +54,8 @@ TRACE_EVENT(dma_queue_init,
 		__entry->tx_mc = tx_mc;
 		__entry->rx_mc = rx_mc;
 		__entry->rxc_mc = rxc_mc;
-		__entry->port = port;
 	),
-	TP_printk("nd%d eng%d q%d tx_count %d rx_count %d tx %llx rx %llx rxc %llx port %d",
+	TP_printk("nd%d eng%d q%d tx_count %d rx_count %d tx %llx rx %llx rxc %llx",
 		__entry->device_index,
 		__entry->eng_id,
 		__entry->qid,
@@ -65,8 +63,7 @@ TRACE_EVENT(dma_queue_init,
 		__entry->rx_desc_count,
 		__entry->rx_mc->pa,
 		__entry->tx_mc->pa,
-		__entry->rxc_mc == NULL ? 0 : __entry->rxc_mc->pa,
-		__entry->port
+		__entry->rxc_mc == NULL ? 0 : __entry->rxc_mc->pa
 	));
 
 TRACE_EVENT(dma_queue_release,
diff --git a/postinstall b/postinstall
index e2f44e5..393ad80 100755
--- a/postinstall
+++ b/postinstall
@@ -1,4 +1,4 @@
-#!/bin/sh -e
+#!/usr/bin/sh -e
 
 rm -f "/etc/modules-load.d/neuron.conf"
 echo "neuron" | tee -a /etc/modules-load.d/neuron.conf
diff --git a/postremove b/postremove
index 3425aa9..d988ee3 100755
--- a/postremove
+++ b/postremove
@@ -1,4 +1,4 @@
-#!/bin/sh -e
+#!/usr/bin/sh -e
 
 NEURON_PRES=$(dkms status | grep neuron)
 if [ -z "${NEURON_PRES}" ]; then
diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h
index 030c19f..314f3a2 100644
--- a/share/neuron_driver_shared.h
+++ b/share/neuron_driver_shared.h
@@ -18,6 +18,8 @@ enum neuron_driver_feature_flag {
 	NEURON_DRIVER_FEATURE_MEM_ALLOC64 = 1ull << 6,
 	NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD = 1ull << 7,
 	NEURON_DRIVER_FEATURE_ZEROCOPY = 1ull << 8,
+	NEURON_DRIVER_FEATURE_PINNED_HOST_MEM = 1ull << 9,
+	NEURON_DRIVER_FEATURE_ALLOC_WITH_PA   = 1ull << 10,
 };
 
 // FIXME  this should be more generic - like node type.
@@ -189,6 +191,7 @@ struct neuron_ioctl_mem_chunk_info {
 #define NEURON_NC_MAP_MAX_ENTRIES 128
 enum neuron_ioctl_nc_mapping_type {
     NEURON_IOCTL_NC_MAPPING_TYPE_V0 = 0,           // seng swap mapping
+    NEURON_IOCTL_NC_MAPPING_TYPE_V1 = 1,           // seng swap mapping but disable die-id flipping in ultra-server nodes.
 };
 struct neuron_ioctl_nc_map_entry {
     __u32 device_id;
@@ -210,7 +213,7 @@ typedef struct neuron_memcpy_batch {
 	void *context;                  // [in] TBD. opaque context pointer passed back in completion queue
 } neuron_memcpy_batch_t;
 
-/* H2D Completion Queue Entry (CQE) */
+/* H2D DMA Completion Queue Entry (CQE) */
 typedef struct neuron_h2d_dma_compl_queue_entry {
     __u64 sequence_num; // Sequence number for the submitted IO request from runtime (0 means empty slot).
     __s64 compl_ret;    // Completion status for the request (0 success; negative errno on failure; positive to be used for future).
diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c
index 5fe4e61..f7c8de7 100644
--- a/v2/neuron_dhal_v2.c
+++ b/v2/neuron_dhal_v2.c
@@ -114,6 +114,18 @@ static int ndhal_register_funcs_inf2(void) {
 }
 
 
+/* Device Arch Functions */
+/**
+ * narch_platform_ready_v2() - return platform ready status
+ *   Certain platform operations require the platform to be in a particular state.
+ *   This v2 implementation ignores both arguments and always returns 0.
+ */
+static int narch_platform_ready_v2(struct neuron_device *nd,  enum neuron_platform_operation_type platform_operation)
+{
+	return 0;
+}
+
+
 /* Device Reset Functions */
 static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map)
 {
@@ -492,6 +504,32 @@ static int ndmar_get_h2t_def_qid_v2(uint32_t nc_id)
 	return 0;
 }
 
+/**
+ * ndmar_ctx_queue_bit_v2() - dummy ctx queue bitmap mapping for v2
+ * @h2d_eng_id: ignored
+ * @qid: ignored
+ *
+ * Async IO is not supported on v2, so this hook is unused; it always returns 0.
+ */
+static int ndmar_ctx_queue_bit_v2(uint32_t h2d_eng_id, uint32_t qid)
+{
+	return 0;
+}
+
+/**
+ * ndmar_ctx_queue_from_bit_v2() - dummy ctx queue bitmap reverse mapping for v2
+ * @bit: ignored
+ * @h2d_eng_id: returned DMA engine id placeholder (always written as 0; dereferenced unconditionally)
+ * @qid: returned DMA queue id placeholder (always written as 0; dereferenced unconditionally)
+ *
+ * Async IO is not supported on v2, so this hook is unused.
+ */
+static void ndmar_ctx_queue_from_bit_v2(int bit, uint32_t *h2d_eng_id, uint32_t *qid)
+{
+	*h2d_eng_id = 0;
+	*qid = 0;
+}
+
 /** 
  * ndmar_is_h2t_def_q() - return true 
  *
@@ -751,9 +789,9 @@ static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size)
  */
 static int mmap_get_bar4_offset_v2(u64 start_addr, u64 size, u64 *offset)
 {
-	if (start_addr >= V2_HBM_0_BASE && start_addr + size < V2_HBM_0_BASE + V2_HBM_0_SIZE)
+	if (start_addr >= V2_HBM_0_BASE && start_addr + size <= V2_HBM_0_BASE + V2_HBM_0_SIZE)
 		*offset = start_addr;
-	else if (start_addr >= V2_HBM_1_BASE && start_addr + size < V2_HBM_1_BASE + V2_HBM_1_SIZE)
+	else if (start_addr >= V2_HBM_1_BASE && start_addr + size <= V2_HBM_1_BASE + V2_HBM_1_SIZE)
 		// The 64GB - 80GB range is mapped to 16GB - 32GB on bar4
 		*offset = start_addr - V2_HBM_1_BASE + V2_HBM_0_SIZE;
 	else
@@ -1382,6 +1420,7 @@ int ndhal_register_funcs_v2(void) {
 		return -EINVAL;
 	}
 
+	ndhal->ndhal_arch.narch_platform_ready = narch_platform_ready_v2;
 	ndhal->ndhal_address_map.pci_host_base = V2_PCIE_A0_BASE;
 	ndhal->ndhal_address_map.mmap_nc_event_offset = V2_MMAP_NC_EVENT_OFFSET;
 	ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V2_MMAP_NC_SEMA_READ_OFFSET;
@@ -1389,7 +1428,6 @@ int ndhal_register_funcs_v2(void) {
 	ndhal->ndhal_address_map.mmap_nc_sema_incr_offset = V2_MMAP_NC_SEMA_INCR_OFFSET;
 	ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V2_MMAP_NC_SEMA_DECR_OFFSET;
 	ndhal->ndhal_address_map.bar0_misc_ram_offset = V2_MMAP_BAR0_APB_MISC_RAM_OFFSET;
-	ndhal->ndhal_address_map.port_1_base = 0ull;
 	ndhal->ndhal_address_map.nc_per_device = V2_NC_PER_DEVICE;
 	ndhal->ndhal_address_map.dev_nc_map = (1 << V2_NC_PER_DEVICE) - 1;
 	ndhal->ndhal_address_map.dice_per_device = V2_NUM_DIE_PER_DEVICE;
@@ -1414,10 +1452,13 @@ int ndhal_register_funcs_v2(void) {
 	ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2;
 	ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2;
     ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v2;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v2;
     ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2;
 	ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2;
 	ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2;
 	ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2;
+	ndhal->ndhal_fw_io.new_readless_read_min_api_version = U32_MAX;
 	ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2;
 	ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2;
 	ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2;
diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c
index 40e683e..e70d99d 100644
--- a/v3/neuron_dhal_v3.c
+++ b/v3/neuron_dhal_v3.c
@@ -40,6 +40,10 @@ int force_die_flip = 0;
 module_param(force_die_flip, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
 MODULE_PARM_DESC(force_die_flip, "Force Neuron Core Mapping APIs to give back DIE flip mappings");
 
+bool enable_sysfs_health_status_nodes = true;
+module_param(enable_sysfs_health_status_nodes, bool, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
+MODULE_PARM_DESC(enable_sysfs_health_status_nodes, "Enable sysfs device health_status nodes");
+
 // TOP SP addresses are sparse on chip adjust to accommodate the table macro
 //
 #define V3_TOP_SP_GRP1_BASE V3_TOP_SP_0_BASE
@@ -262,6 +266,23 @@ static enum neuron_platform_type ndhal_platform_type_v3(void)
 	return platform_type;
 }
 
+
+/* Device Arch Functions */
+/**
+ * narch_platform_ready_v3() - return platform ready status for an operation
+ *   Standard platforms always report 0; every other platform type defers
+ *   the decision to npe_platform_ready().
+ */
+static int narch_platform_ready_v3(struct neuron_device *nd,  enum neuron_platform_operation_type platform_operation)
+{
+	// only non-standard platforms have readiness state to consult
+	if (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_STD) {
+		return npe_platform_ready(nd, platform_operation);
+	}
+	return 0;
+}
+
+
 /* Device Reset Functions */
 /**
  * nr_get_tpb_reset_map() - generates a the reset map of all resources associated with resetting a particular TPB
@@ -332,15 +353,19 @@ static int nr_initiate_reset_v3(struct neuron_device *nd, uint32_t nc_map)
 
 static int nr_initiate_reset_v3_qemu(struct neuron_device *nd, uint32_t nc_map)
 {
+	uint32_t reset_val = nc_map;
 	uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0;
 	volatile void *addr;
 
 	if (no_reset)
 		return 0;
 
-	nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi);
+	if (nc_map != NEURON_NC_MAP_DEVICE) {
+		nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi);
+		reset_val = tpb_reset_map_lo;
+	}
 	addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10;
-	writel(tpb_reset_map_lo, (volatile uint32_t *)addr);
+	writel(reset_val, (volatile uint32_t *)addr);
 
 	return 0;
 }
@@ -419,6 +444,7 @@ static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successf
 	} else {
 		nd->supports_hbm_7200 = 0;
 	}
+	nd->current_perf_profile = 0;
 
 	if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) {
 		return 0;
@@ -711,6 +737,30 @@ static int ndmar_get_h2t_def_qid_v3(uint32_t nc_id)
 	return nc_id % V3_NC_PER_SENG;
 }
 
+/**
+ * ndmar_ctx_queue_bit_v3() - map a V3 H2D queue to a dense bitmap index
+ * @h2d_eng_id: DMA engine id in the V3 H2D/D2H engine range (128 to 131)
+ * @qid: DMA queue id within the engine
+ * Return: (engine offset from V3_D2H_0_IDX) * DMA_MAX_Q_V4 + @qid.
+ */
+static int ndmar_ctx_queue_bit_v3(uint32_t h2d_eng_id, uint32_t qid)
+{
+	uint32_t eng_offset = h2d_eng_id - V3_D2H_0_IDX;
+	return eng_offset * DMA_MAX_Q_V4 + qid;
+}
+
+/**
+ * ndmar_ctx_queue_from_bit_v3() - map a dense bitmap index back to a V3 H2D queue
+ * @bit: bitmap bit index (not range checked here; NOTE(review): presumably produced by ndmar_ctx_queue_bit_v3() - confirm)
+ * @h2d_eng_id: returned DMA engine id
+ * @qid: returned DMA queue id
+ */
+static void ndmar_ctx_queue_from_bit_v3(int bit, uint32_t *h2d_eng_id, uint32_t *qid)
+{
+	*h2d_eng_id = V3_D2H_0_IDX + (bit / DMA_MAX_Q_V4);	// engine = base index + quotient
+	*qid = bit % DMA_MAX_Q_V4;	// queue = remainder within the engine
+}
+
 /**
  * ndmar_is_h2t_def_q() - return true
  *
@@ -967,13 +1017,13 @@ static int mmap_get_bar4_offset_v3(u64 start_addr, u64 size, u64 *offset)
 {
 	u64 hbm_dist = narch_is_qemu() ? (ndhal->ndhal_pci.dram_bar_size / 4) : V3_HBM_SIZE;
 
-	if (start_addr >= V3_HBM_0_BASE && start_addr + size < V3_HBM_0_BASE + V3_HBM_ACTIVE_SIZE)
+	if (start_addr >= V3_HBM_0_BASE && start_addr + size <= V3_HBM_0_BASE + V3_HBM_ACTIVE_SIZE)
 		*offset = start_addr;
-	else if (start_addr >= V3_HBM_1_BASE && start_addr + size < V3_HBM_1_BASE + V3_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V3_HBM_1_BASE && start_addr + size <= V3_HBM_1_BASE + V3_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V3_HBM_1_BASE + hbm_dist;
-	else if (start_addr >= V3_HBM_2_BASE && start_addr + size < V3_HBM_2_BASE + V3_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V3_HBM_2_BASE && start_addr + size <= V3_HBM_2_BASE + V3_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V3_HBM_2_BASE + hbm_dist * 2;
-	else if (start_addr >= V3_HBM_3_BASE && start_addr + size < V3_HBM_3_BASE + V3_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V3_HBM_3_BASE && start_addr + size <= V3_HBM_3_BASE + V3_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V3_HBM_3_BASE + hbm_dist * 3;
 	else
 		return -EINVAL;
@@ -1010,6 +1060,7 @@ static int nsysfsmetric_add_ecc_nodes_v3(struct nsysfsmetric_metrics *metrics,
 		pr_err("failed to add hardware node its attributes under stats\n");
 		return -1;
 	}
+	metrics->hardware_node = hardware_node;
 
 	return 0;
 }
@@ -1026,48 +1077,44 @@ static void nsysfsmetric_get_hbm_error_count_v3(struct neuron_device *nd,
                                                  uint32_t *err_count)
 {
 	int ret;
-	uint32_t total_uncorrected_ecc_err_count;
-	uint32_t total_repairable_ecc_err_count;
+	uint32_t total_unrepairable_ecc_err_count = 0;
+	uint32_t total_repairable_ecc_err_count = 0;
 	uint32_t ecc_repair_state;
 
 	*err_count = 0;
 
+	// read regs 17-20
+	fw_io_get_total_ecc_err_counts(nd->npdev.bar0, &total_unrepairable_ecc_err_count, &total_repairable_ecc_err_count);
+
+	// read reg 25
 	ret = fw_io_hbm_uecc_repair_state_read(nd->npdev.bar0, &ecc_repair_state);
 	if (ret) {
 		pr_err("sysfs failed to read HBM ECC repair state from FWIO\n");
 		return;
 	}
-	fw_io_get_total_ecc_err_counts(nd->npdev.bar0, &total_uncorrected_ecc_err_count, &total_repairable_ecc_err_count);
-
-	/*
-	*  HBM Repair State Bitfield notes:
-	*      2 bits to represent the state of hbm repair
-	*      0x0 means no pending repair
-	*      0x1 means pending repair
-	*      0x2 means repair failure
-	*/
-	if (total_uncorrected_ecc_err_count == 0 && ecc_repair_state != 0) {
-		// For legacy firmware, there might be the case that (err count > 0 && repair state == 0), so allow this case
-		// When err count = 0, repair state must be 0x0
-		pr_warn_once("[ND %d] Total Uncorrected ecc err count is %d, but repair state is %d which is invalid. Please contact Neuron for support.\n", nd->device_index, total_uncorrected_ecc_err_count, ecc_repair_state);
-		return;
+
+	if (ecc_repair_state > 0x2) {
+    	pr_warn_once("[ND %d] HBM unexpected ecc_repair_state: 0x%x\n", nd->device_index, ecc_repair_state);
+	}
+
+
+	if (ecc_repair_state == 0x2) { // repair failure
+		total_unrepairable_ecc_err_count += 1;
 	}
 
-	// We did not complete the repair for some reason, in this case we expect that the error count is non-zero since the repairs have
-	// not gone through yet. If it is zero notify the user since this is unexpected.
 	if (ecc_repair_state == 0x1 && total_repairable_ecc_err_count == 0) {
-		pr_warn_once("[ND %d] HBM repairs were not completed, but no repairable ecc errors were reported, which is invalid. Please contact Neuron for support.\n", nd->device_index);
-		return;
-	} 
+		/* Known race condition: it may take up to 5 seconds to have consistent regs, but we can't wait that long
+		in a sysfs read. Increment repairable UE by 1 */
+		pr_warn_once("[ND %d] HBM pending_repair but no repairable errors\n", nd->device_index);
+		total_repairable_ecc_err_count += 1;
+	}
 
-	// We failed to repair ECC memory but have not encountered a UECC yet. Proactively notify the user of this since the ECC 
-	// will be more susceptible to errors in the future.
-	if (ecc_repair_state == 0x2 && total_uncorrected_ecc_err_count == 0) {
-		pr_warn_once("[ND %d] HBM repair failed. No uncorrectable ecc errors detected, however memory will be more suseptible to corruption. Please contact Neuron for support.\n", nd->device_index);
-		return;
+	if (ecc_repair_state == 0x0 && total_repairable_ecc_err_count > 0) {
+		/* Unexpected / unknown race condition   */
+		pr_warn_once("[ND %d] HBM repairable errors but no pending_repair\n", nd->device_index);
 	}
 
-	*err_count = (repairable) ? total_repairable_ecc_err_count : total_uncorrected_ecc_err_count;
+	*err_count = (repairable) ? total_repairable_ecc_err_count : total_unrepairable_ecc_err_count;
 }
 
 /**
@@ -1538,11 +1585,18 @@ static const struct neuron_ioctl_nc_map_entry nc_mapping_v0_seng_swap[] = {
 static_assert((NC_MAPPING_V0_SENG_SWAP_SIZE == NC_MAPPING_MAX_CORE_COUNT_V3) && (NC_MAPPING_V0_SENG_SWAP_SIZE <= NEURON_NC_MAP_MAX_ENTRIES));
 static const uint32_t neuron_nc_map_die_flip_mask = 0x6;
 
-static bool ndhal_die_flipped(void)
+static bool ndhal_die_flipped(enum neuron_ioctl_nc_mapping_type version)
 {
 	u32 state;
 	s8 node_id;
 
+	if (version == NEURON_IOCTL_NC_MAPPING_TYPE_V1) {
+		if (force_die_flip) {
+			pr_info("Runtime disabled die id flipping. overriding driver force mode");
+		}
+		return false;
+	}
+
 	if (force_die_flip) {
 		return true;
 	}
@@ -1559,12 +1613,12 @@ static bool ndhal_die_flipped(void)
 
 static int ncdev_logical_to_physical_nc_map_v3(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type version)
 {
-	bool apply_dieflip = ndhal_die_flipped();
+	bool apply_dieflip = ndhal_die_flipped(version);
 	uint32_t entry_idx;
 	uint32_t entries_to_copy = (max_num_entries < NC_MAPPING_MAX_CORE_COUNT_V3) ? max_num_entries : NC_MAPPING_MAX_CORE_COUNT_V3;
 	const struct neuron_ioctl_nc_map_entry *mapping;
 
-	if (version != NEURON_IOCTL_NC_MAPPING_TYPE_V0) {
+	if (version != NEURON_IOCTL_NC_MAPPING_TYPE_V0 && version != NEURON_IOCTL_NC_MAPPING_TYPE_V1) {
 		pr_err("Unsupported Neuron Core Mapping verion %u for v3 arch", version);
 		return -EINVAL;
 	}
@@ -1705,7 +1759,7 @@ static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile)
 		if (retval == 0) {
 			nd->current_perf_profile = cur_profile;
 		} else {
-			nd->current_perf_profile = 0;
+			nd->current_perf_profile = -1;
 		}
 	}
     return ret;
@@ -1874,6 +1928,7 @@ int ndhal_register_funcs_v3(void) {
 	}
 
 	ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3();
+	ndhal->ndhal_arch.narch_platform_ready = narch_platform_ready_v3;
 	ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE;
 	ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET;
 	ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V3_MMAP_NC_SEMA_READ_OFFSET;
@@ -1881,7 +1936,6 @@ int ndhal_register_funcs_v3(void) {
 	ndhal->ndhal_address_map.mmap_nc_sema_incr_offset = V3_MMAP_NC_SEMA_INCR_OFFSET;
 	ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V3_MMAP_NC_SEMA_DECR_OFFSET;
 	ndhal->ndhal_address_map.bar0_misc_ram_offset = V3_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET;
-	ndhal->ndhal_address_map.port_1_base = 0ull;
 	ndhal->ndhal_address_map.nc_per_device = V3_NC_PER_DEVICE;
 	ndhal->ndhal_address_map.dev_nc_map = (1 << V3_NC_PER_DEVICE) - 1;
 	ndhal->ndhal_address_map.dice_per_device = V3_NUM_DIE_PER_DEVICE;
@@ -1906,10 +1960,13 @@ int ndhal_register_funcs_v3(void) {
 	ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3;
 	ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3;
 	ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v3;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v3;
 	ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3;
 	ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3;
 	ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3;
 	ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3;
+	ndhal->ndhal_fw_io.new_readless_read_min_api_version = 7;
 	ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3;
 	ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3;
 	ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3;
@@ -1922,6 +1979,7 @@ int ndhal_register_funcs_v3(void) {
 	ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_ecc_nodes = nsysfsmetric_add_ecc_nodes_v3;
 	ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count = nsysfsmetric_get_hbm_error_count_v3;
 	ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v3;
+	ndhal->ndhal_sysfs_metrics.health_status_enabled = enable_sysfs_health_status_nodes;
 	ndhal->ndhal_pci.axi_bar = BAR_UNUSED;
 	ndhal->ndhal_pci.apb_bar = 0;
 	ndhal->ndhal_pci.dram_bar = 4;
diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c
index 26249c9..96b2949 100644
--- a/v3/neuron_pelect.c
+++ b/v3/neuron_pelect.c
@@ -96,6 +96,9 @@
  *     - If an election fails due to broken links, we attempt to run the election using only one link pair in an attempt to for two 2-node pairs.
  *       Currently we first attempt this on the right link, then if that fails, attempt the election again on the left link.
  *
+ *   - Impact of device reset failure on election/configuration
+ *     - If any device fails reset, the election is declared a failure and the platform will default to running single instance mode
+ *
  *   Election Results:
  *     Results of the election are reported in sysfs under /sys/class/neuron_device.
  *
@@ -356,7 +359,7 @@ static int npe_pod_neighbor_io_init(pod_neighbor_io_t* pnio, struct neuron_devic
 		goto done;
 	}
 
-	ret = ndmar_queue_init(nd, pnio->eng_id, 0, pnio->ring_size, pnio->ring_size, pnio->tx_mc, pnio->rx_mc, NULL, 0, true);
+	ret = ndmar_queue_init(nd, pnio->eng_id, 0, pnio->ring_size, pnio->ring_size, pnio->tx_mc, pnio->rx_mc, NULL, true);
 	if (ret) {
 		pr_err("pod election io queue init failed");
 		goto done;
@@ -1150,8 +1153,10 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful)
 	int node_cnt;
 	u32 lr_neighbor_mask;
 	u64 pod_serial_number;
-	
+
+	// Declare election/configuration failed if any device fails reset
 	if (!reset_successful) {
+		ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_FAILURE;
 		return 0;
 	}
 
@@ -1201,8 +1206,13 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful)
 
 	// if we aren't kicking off election on first driver reset (testing) or 
 	// if we aren't in init state then we've already made an election decision.
+	// Since election is happening for ultra server, skipping the election is
+	// applied for the ultra-server only. In case of PDS, ignore election skip
+	// ctl.
 	//
-	if ((ndhal_pelect_data.pod_state_internal != NEURON_NPE_POD_ST_INIT) || npe_pod_ctl_is_set(NPE_POD_CTL_RST_SKIP_ELECTION)) {
+	if ((ndhal_pelect_data.pod_state_internal != NEURON_NPE_POD_ST_INIT) ||
+        (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_PDS &&
+         npe_pod_ctl_is_set(NPE_POD_CTL_RST_SKIP_ELECTION))) {
 		goto done;
 	}
 
@@ -1635,6 +1645,47 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod
 	return ret;
 }
 
+/**
+ * npe_platform_ready() - check if the platform can support a given operation
+ * @nd: neuron device the request is for (not examined by this function)
+ * @platform_operation: operation the caller wants to perform
+ *
+ *     UltraServers and PDS have to collect configuration data locally or from neighbors;
+ *     this function determines if the platform supports a particular operation.
+ *     Currently the only thing we wait on is for config data to be available on PDS
+ *     droplets.  The time it takes to collect config data is constrained by reset time
+ *     across devices, so we block opens until config is complete on PDS servers.
+ *
+ * Return:
+ *   -EBUSY  device open requested on PDS while npe_pod_state_busy() reports busy
+ *   0       every other platform type / operation combination
+ */
+int npe_platform_ready(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation)
+{
+	const bool is_open = (platform_operation == NEURON_PLATFORM_OP_TYPE_DEVOPEN);
+
+	switch (ndhal->ndhal_arch.platform_type) {
+		case NEURON_PLATFORM_TYPE_PDS:
+			// opens are held off while the pod state is busy;
+			// exec and any other operation are never blocked
+			if (is_open && npe_pod_state_busy()) {
+				return -EBUSY;
+			}
+			break;
+
+		case NEURON_PLATFORM_TYPE_ULTRASERVER:
+			// no operation is gated on UltraServer today
+			break;
+
+		default:
+			// remaining platform types have nothing to wait for
+			break;
+	}
+
+	// platform is ready for the requested operation
+	return 0;
+}
+
 static int npe_election_thread_fn(void *arg)
 {
 	int ret;
diff --git a/v3/neuron_pelect.h b/v3/neuron_pelect.h
index 2e9f4a2..06f87ff 100644
--- a/v3/neuron_pelect.h
+++ b/v3/neuron_pelect.h
@@ -84,6 +84,12 @@ int npe_get_pod_status(u32 *state, u8 *node_id);
  */
 int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *state);
 
+/**
+ * npe_platform_ready() - return platform readiness for specified operation
+ *
+ */
+int npe_platform_ready(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation);
+
 /**
  * npe_class_node_id_show_data() - return sysfs class node_id
  *
diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c
index 72b87c2..9ad23df 100644
--- a/v4/neuron_dhal_v4.c
+++ b/v4/neuron_dhal_v4.c
@@ -149,8 +149,9 @@ static int ndhal_register_funcs_trn3(void) {
 
 /* Instance names
  */
-#define NEURON_TRN3PDS_INSTANCE_NAME "trn3s.48xlarge"
+#define NEURON_TRN3PDS_INSTANCE_NAME "trn3.48xlarge"
 #define NEURON_TRN3PDS0_INSTANCE_NAME "trn3-dev0.48xlarge"
+#define NEURON_TRN3PDS1_INSTANCE_NAME "trn3-dev1.48xlarge"
 #define NEURON_TRN3P_INSTANCE_NAME "trn3p.48xlarge"
 
 static enum neuron_platform_type ndhal_platform_type_v4(void)
@@ -161,7 +162,9 @@ static enum neuron_platform_type ndhal_platform_type_v4(void)
 	if (narch_get_instance_type_name(buf, sizeof(buf))) goto done;
 	if ((strncmp(buf, NEURON_TRN3PDS_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) {
 		platform_type = NEURON_PLATFORM_TYPE_PDS;
-	} else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) {
+	} else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS0_INSTANCE_NAME)-1) == 0)) {
+		platform_type = NEURON_PLATFORM_TYPE_PDS;
+	} else if ((strncmp(buf, NEURON_TRN3PDS1_INSTANCE_NAME, sizeof(NEURON_TRN3PDS1_INSTANCE_NAME)-1) == 0)) {
 		platform_type = NEURON_PLATFORM_TYPE_PDS;
 	} else if ((strncmp(buf, NEURON_TRN3P_INSTANCE_NAME, sizeof(NEURON_TRN3P_INSTANCE_NAME)-1) == 0)) {
 		platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER;
@@ -169,6 +172,9 @@ static enum neuron_platform_type ndhal_platform_type_v4(void)
 		platform_type = NEURON_PLATFORM_TYPE_STD;
 	}
 
+	if (narch_is_qemu() || narch_is_emu())
+		platform_type = NEURON_PLATFORM_TYPE_STD;
+
 done:
 	return platform_type;
 }
@@ -187,6 +193,32 @@ static bool ndhal_instance_type_3xl(void)
 	return instance_type_is_3xl;
 }
 
+/**
+ * ndmar_ctx_queue_bit_v4() - dummy ctx queue bitmap mapping for v4
+ * @h2d_eng_id: ignored
+ * @qid: ignored
+ *
+ * Async IO is not supported on v4 yet, so this hook is unused; it always returns 0.
+ */
+static int ndmar_ctx_queue_bit_v4(uint32_t h2d_eng_id, uint32_t qid)
+{
+	return 0;
+}
+
+/**
+ * ndmar_ctx_queue_from_bit_v4() - dummy ctx queue bitmap reverse mapping for v4
+ * @bit: ignored
+ * @h2d_eng_id: returned DMA engine id placeholder (always written as 0; dereferenced unconditionally)
+ * @qid: returned DMA queue id placeholder (always written as 0; dereferenced unconditionally)
+ *
+ * Async IO is not supported on v4 yet, so this hook is unused.
+ */
+static void ndmar_ctx_queue_from_bit_v4(int bit, uint32_t *h2d_eng_id, uint32_t *qid)
+{
+	*h2d_eng_id = 0;
+	*qid = 0;
+}
+
 
 /* Memory Pool Functions */
 /**
@@ -252,13 +284,13 @@ static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset)
 {
 	u64 hbm_dist = narch_is_qemu() ? (ndhal->ndhal_pci.dram_bar_size / 4) : V4_HBM_SIZE;
 
-	if (start_addr >= V4_HBM_0_BASE && start_addr + size < V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE)
+	if (start_addr >= V4_HBM_0_BASE && start_addr + size <= V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE)
 		*offset = start_addr;
-	else if (start_addr >= V4_HBM_1_BASE && start_addr + size < V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V4_HBM_1_BASE && start_addr + size <= V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V4_HBM_1_BASE + hbm_dist;
-	else if (start_addr >= V4_HBM_2_BASE && start_addr + size < V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V4_HBM_2_BASE && start_addr + size <= V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V4_HBM_2_BASE + hbm_dist * 2;
-	else if (start_addr >= V4_HBM_3_BASE && start_addr + size < V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE)
+	else if (start_addr >= V4_HBM_3_BASE && start_addr + size <= V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE)
 		*offset = start_addr - V4_HBM_3_BASE + hbm_dist * 3;
 	else
 		return -EINVAL;
@@ -436,11 +468,14 @@ int ndhal_register_funcs_v4(void) {
 	}
 
 	ndhal->ndhal_arch.platform_type = ndhal_platform_type_v4();
+	ndhal->ndhal_fw_io.new_readless_read_min_api_version = 6;
 	ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v4;
 	ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v4;
 	ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v4;
 	ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4;
 	ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v4;
+	ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v4;
 	ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4;
 	ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v4;