diff --git a/Cargo.lock b/Cargo.lock index 6b0ebaa..6e6db66 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,8 +29,7 @@ dependencies = [ [[package]] name = "async-cuda" version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56bf487caab780f706b84b5714aa01c27996429d0d0e1538617582038dd0526c" +source = "git+https://github.com/micahcc/async-cuda.git?branch=main#80bf68771b6da5a2e1c97cf6045668348dc9be76" dependencies = [ "cpp", "cpp_build", diff --git a/Cargo.toml b/Cargo.toml index bb442b3..2ca3a58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,10 @@ async-cuda = "0.6.1" cpp = "0.5" tracing = "0.1" +[patch.crates-io] +# need to upstream cuda get devices +async-cuda = { git = "https://github.com/micahcc/async-cuda.git", branch = "main"} + [dev-dependencies] tempfile = "3.4" tokio = { version = "1", default-features = false, features = [ diff --git a/src/ffi/sync/engine.rs b/src/ffi/sync/engine.rs index dac5461..a4382df 100644 --- a/src/ffi/sync/engine.rs +++ b/src/ffi/sync/engine.rs @@ -1,12 +1,12 @@ -use cpp::cpp; - -use async_cuda::device::DeviceId; -use async_cuda::ffi::device::Device; - use crate::error::last_error; use crate::ffi::memory::HostBuffer; use crate::ffi::result; use crate::ffi::sync::runtime::Runtime; +use async_cuda::device::DeviceId; +use async_cuda::ffi::device::Device; +use async_cuda::ffi::ptr::DevicePtr; +use cpp::cpp; +use std::sync::Arc; type Result = std::result::Result; @@ -32,6 +32,40 @@ unsafe impl Send for Engine {} /// The TensorRT API is thread-safe with regards to all operations on [`Engine`]. unsafe impl Sync for Engine {} +#[derive(Copy, Clone, Debug)] +pub enum DataType { + /// 32-bit floating point format. + Float, + /// IEEE 16-bit floating-point format – has a 5 bit exponent and 11 bit significand. + Half, + /// Signed 8-bit integer representing a quantized floating-point value. + Int8, + /// Signed 32-bit integer format. + Int32, + /// 8-bit boolean. 
0 = false, 1 = true, other values undefined. + Bool, + /// Unsigned 8-bit integer format. Cannot be used to represent quantized floating-point values. + /// Use the IdentityLayer to convert kUINT8 network-level inputs to {kFLOAT, kHALF} prior to + /// use with other TensorRT layers, or to convert intermediate output before kUINT8 + /// network-level outputs from {kFLOAT, kHALF} to kUINT8. kUINT8 conversions are only supported + /// for {kFLOAT, kHALF}. kUINT8 to {kFLOAT, kHALF} conversion will convert the integer values + /// to equivalent floating point values. {kFLOAT, kHALF} to kUINT8 conversion will convert the + /// floating point values to integer values by truncating towards zero. This conversion has + /// undefined behavior for floating point values outside the range [0.0F, 256.0F) after + /// truncation. kUINT8 conversions are not supported for {kINT8, kINT32, kBOOL}. + Uint8, + /// Signed 8-bit floating point with 1 sign bit, 4 exponent bits, 3 mantissa bits, and exponent-bias 7. + Fp8, + /// Brain float – has an 8 bit exponent and 8 bit significand. + Bf16, + /// Signed 64-bit integer type. + Int64, + /// Signed 4-bit integer type. 
+ Int4, + /// 4-bit floating point type 1 bit sign, 2 bit exponent, 1 bit mantissa + Fp4, +} + impl Engine { #[inline] pub(crate) fn wrap(internal: *mut std::ffi::c_void, runtime: Runtime) -> Self { @@ -58,6 +92,42 @@ impl Engine { num_io_tensors as usize } + pub fn io_tensor_type(&self, io_tensor_index: usize) -> DataType { + let internal = self.as_ptr(); + let io_tensor_index = io_tensor_index as std::os::raw::c_int; + let data_type: i32 = cpp!(unsafe [ + internal as "const void*", + io_tensor_index as "int" + ] -> i32 as "DataType" { + // Added in TRT 8 + #if NV_TENSORRT_MAJOR >= 8 + const char* name = ((const ICudaEngine*) internal)->getIOTensorName(io_tensor_index); + if(name == nullptr) { + return DataType::kFLOAT; + } + return ((const ICudaEngine*) internal)->getTensorDataType(name); + #else + // Removed in TRT 10 + return ((const ICudaEngine*) internal)->getBindingDataType(io_tensor_index); + #endif + }); + + match data_type { + 0 => DataType::Float, + 1 => DataType::Half, + 2 => DataType::Int8, + 3 => DataType::Int32, + 4 => DataType:: Bool, + 5 => DataType::Uint8, + 6 => DataType::Fp8, + 7 => DataType::Bf16, + 8 => DataType::Int64, + 9 => DataType::Int4, + 10 => DataType::Fp4, + _ => panic!("Unknown data type ({data_type}), you might be using an unsupported version of TensorRT") + } + } + pub fn io_tensor_name(&self, io_tensor_index: usize) -> String { let internal = self.as_ptr(); let io_tensor_index = io_tensor_index as std::os::raw::c_int; @@ -175,8 +245,8 @@ unsafe impl<'engine> Send for ExecutionContext<'engine> {} unsafe impl<'engine> Sync for ExecutionContext<'engine> {} impl ExecutionContext<'static> { - pub fn from_engine(mut engine: Engine) -> Result { - let internal = unsafe { Self::new_internal(&mut engine) }; + pub fn from_engine(engine: Engine) -> Result { + let internal = unsafe { Self::new_internal(&engine) }; result!( internal, Self { @@ -188,10 +258,23 @@ impl ExecutionContext<'static> { ) } - pub fn from_engine_many(mut engine: Engine, 
num: usize) -> Result> { + pub fn from_shared_engine(engine: Arc) -> Result { + let internal = unsafe { Self::new_internal(&engine) }; + result!( + internal, + Self { + internal, + device: engine.device(), + _parent: Some(engine), + _phantom: Default::default(), + } + ) + } + + pub fn from_engine_many(engine: Engine, num: usize) -> Result> { let mut internals = Vec::with_capacity(num); for _ in 0..num { - internals.push(unsafe { Self::new_internal(&mut engine) }); + internals.push(unsafe { Self::new_internal(&engine) }); } let device = engine.device(); let parent = std::sync::Arc::new(engine); @@ -213,7 +296,7 @@ impl ExecutionContext<'static> { } impl<'engine> ExecutionContext<'engine> { - pub fn new(engine: &'engine mut Engine) -> Result { + pub fn new(engine: &'engine Engine) -> Result { let internal = unsafe { Self::new_internal(engine) }; result!( internal, @@ -226,6 +309,32 @@ impl<'engine> ExecutionContext<'engine> { ) } + pub fn bind_tensor( + &mut self, + tensor_name: &str, + buffer: &mut async_cuda::ffi::memory::DeviceTensor, + ) -> Result<()> { + Ok(unsafe { self.set_tensor_address::(tensor_name, buffer.as_mut_internal()) }?) 
+ } + + /// Enqueue with pre-bound + /// this allows for assorted types of inputs + pub fn enqueue_prebound(&mut self, stream: &async_cuda::ffi::stream::Stream) -> Result<()> { + let internal = self.as_mut_ptr(); + let stream_ptr = stream.as_internal().as_ptr(); + let success = cpp!(unsafe [ + internal as "void*", + stream_ptr as "const void*" + ] -> bool as "bool" { + return ((IExecutionContext*) internal)->enqueueV3((cudaStream_t) stream_ptr); + }); + if success { + Ok(()) + } else { + Err(last_error()) + } + } + pub fn enqueue( &mut self, io_tensors: &mut std::collections::HashMap< @@ -237,7 +346,7 @@ impl<'engine> ExecutionContext<'engine> { let internal = self.as_mut_ptr(); for (tensor_name, buffer) in io_tensors { unsafe { - self.set_tensor_address(tensor_name, buffer)?; + self.set_tensor_address::(tensor_name, buffer.as_mut_internal())?; } } let stream_ptr = stream.as_internal().as_ptr(); @@ -271,13 +380,14 @@ impl<'engine> ExecutionContext<'engine> { self.device } - unsafe fn new_internal(engine: &mut Engine) -> *mut std::ffi::c_void { + unsafe fn new_internal(engine: &Engine) -> *mut std::ffi::c_void { Device::set_or_panic(engine.device()); - let internal_engine = engine.as_mut_ptr(); + let internal_engine = engine.as_ptr(); let internal = cpp!(unsafe [ internal_engine as "void*" ] -> *mut std::ffi::c_void as "void*" { - return (void*) ((ICudaEngine*) internal_engine)->createExecutionContext(); + void* out = (void*) ((ICudaEngine*) internal_engine)->createExecutionContext(); + return out; }); internal } @@ -285,14 +395,14 @@ impl<'engine> ExecutionContext<'engine> { unsafe fn set_tensor_address( &mut self, tensor_name: &str, - buffer: &mut async_cuda::ffi::memory::DeviceBuffer, + buffer_ptr: &mut DevicePtr, ) -> Result<()> { let internal = self.as_mut_ptr(); let tensor_name_cstr = std::ffi::CString::new(tensor_name).unwrap(); let tensor_name_ptr = tensor_name_cstr.as_ptr(); - let buffer_ptr = buffer.as_mut_internal().as_mut_ptr(); + let buffer_ptr = 
buffer_ptr.as_ptr(); let success = cpp!(unsafe [ - internal as "const void*", + internal as "void*", tensor_name_ptr as "const char*", buffer_ptr as "void*" ] -> bool as "bool" {