diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..eb315c4
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,105 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+# Default CUDA archs (Ampere + Hopper); override with -DCMAKE_CUDA_ARCHITECTURES=...
+# or CUDAARCHS. Must precede project(): enabling CUDA pre-fills the variable (CMP0104).
+if(NOT CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
+  set(CMAKE_CUDA_ARCHITECTURES "80;86;90")
+endif()
+project(cuhpx LANGUAGES CXX CUDA)
+message(STATUS "✅ cuhpx build configured. CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}")
+
+# ---------- Basics ----------
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# LTO/IPO can drop CUDA fatbins; keep it off for these extensions
+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
+
+# ---------- Python / Torch ----------
+find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+# Torch ships as a binary wheel, so look in platlib (it can differ from purelib).
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['platlib'])"
+  OUTPUT_VARIABLE PYTHON_SITE_PACKAGES OUTPUT_STRIP_TRAILING_WHITESPACE
+  RESULT_VARIABLE cuhpx_site_packages_rc)
+if(NOT cuhpx_site_packages_rc EQUAL 0)
+  message(WARNING "Could not query site-packages from ${Python_EXECUTABLE}; set Torch_DIR manually")
+endif()
+list(APPEND CMAKE_PREFIX_PATH "${PYTHON_SITE_PACKAGES}/torch/share/cmake")
+find_package(Torch REQUIRED)        # ${TORCH_LIBRARIES}, ${TORCH_CXX_FLAGS}
+find_package(pybind11 REQUIRED)
+
+# ---------- CUDA toolchain (modern imported targets) ----------
+# CUDAToolkit supplies the imported CUDA::cudart target linked by the modules.
+find_package(CUDAToolkit REQUIRED)
+
+# Optional: Torch's Python shim (resolves at::Tensor pybind casters); the
+# extension targets link it only when find_library() locates it.
+find_library(
+  TORCH_PYTHON_LIBRARY
+  NAMES torch_python
+  HINTS "${PYTHON_SITE_PACKAGES}/torch/lib" "${TORCH_INSTALL_PREFIX}/lib")
+if(TORCH_PYTHON_LIBRARY)
+  message(STATUS "Found torch_python at: ${TORCH_PYTHON_LIBRARY}")
+endif()
+
+# Directory-scoped include paths: Torch headers and the project's src/ tree.
+include_directories(${TORCH_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/src)
+
+# ---------- Helper: apply safe CUDA flags per target ----------
+# cuhpx_apply_cuda_flags(<target>): add the shared nvcc options to <target> and
+# turn on separable compilation / turn off IPO for it.
+function(cuhpx_apply_cuda_flags target_name)
+  # SHELL: keeps "-Xcudafe <arg>" together; plain options get de-duplicated.
+  target_compile_options(${target_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=20014>")
+  set_target_properties(${target_name} PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    INTERPROCEDURAL_OPTIMIZATION FALSE)
+endfunction()
+
+# ==================== cuhpx_fft ====================
+# Python extension module built from the harmonic_transform sources (C++ + CUDA).
+set(CUHPX_FFT_SRC src/harmonic_transform/hpx_fft.cpp src/harmonic_transform/hpx_fft_cuda.cu)
+pybind11_add_module(cuhpx_fft MODULE ${CUHPX_FFT_SRC})
+# NOTE(review): passed as definitions; relies on TORCH_CXX_FLAGS holding only -D flags.
+target_compile_definitions(cuhpx_fft PRIVATE ${TORCH_CXX_FLAGS})
+target_link_libraries(cuhpx_fft PRIVATE ${TORCH_LIBRARIES} CUDA::cudart Python::Module)
+if(TORCH_PYTHON_LIBRARY)
+  # Linked only when the earlier find_library() call located torch_python.
+  target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+set_target_properties(cuhpx_fft PROPERTIES
+  PREFIX ""
+  OUTPUT_NAME "cuhpx_fft"
+)
+# Shared nvcc options and target properties.
+cuhpx_apply_cuda_flags(cuhpx_fft)
+
+# ==================== cuhpx_remap ====================
+# Python extension module built from the data_remapping sources (C++ + CUDA).
+set(CUHPX_REMAP_SRC src/data_remapping/hpx_remapping.cpp src/data_remapping/hpx_remapping_cuda.cu)
+pybind11_add_module(cuhpx_remap MODULE ${CUHPX_REMAP_SRC})
+# NOTE(review): passed as definitions; relies on TORCH_CXX_FLAGS holding only -D flags.
+target_compile_definitions(cuhpx_remap PRIVATE ${TORCH_CXX_FLAGS})
+target_link_libraries(cuhpx_remap PRIVATE ${TORCH_LIBRARIES} CUDA::cudart Python::Module)
+if(TORCH_PYTHON_LIBRARY)
+  # Linked only when the earlier find_library() call located torch_python.
+  target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+set_target_properties(cuhpx_remap PROPERTIES
+  PREFIX ""
+  OUTPUT_NAME "cuhpx_remap"
+)
+# Shared nvcc options and target properties.
+cuhpx_apply_cuda_flags(cuhpx_remap)
+
+# ---------- Install layout for wheels ----------
+# scikit-build-core will install these into site-packages/cuhpx/ for wheels.
+install(
+  TARGETS cuhpx_fft cuhpx_remap
+  LIBRARY DESTINATION cuhpx)
+# Copy the cuhpx/data tree into the package, keeping only *.fits files.
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data" DESTINATION cuhpx FILES_MATCHING PATTERN "*.fits")
diff --git a/Dockerfile b/Dockerfile
index bb93365..a335834 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,7 @@ RUN apt-get update
 
 COPY . /cuhpx
 RUN cd /cuhpx && \
+    pip install "scikit-build-core>=0.8" "pybind11>=2.12" && \
     pip install --no-build-isolation .
 
 # Set the default command for the container
diff --git a/README.md b/README.md
index ebe873b..262984d 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ To setup the library, run
 ```bash
 git clone https://gitlab-master.nvidia.com/Devtech-Compute/cuhpx.git
 cd cuhpx
+pip install "scikit-build-core>=0.8" "pybind11>=2.12"
 pip install --no-build-isolation .
 ```
 
diff --git a/cuhpx/hpx_remap.py b/cuhpx/hpx_remap.py
index a1adb05..b9b99fd 100644
--- a/cuhpx/hpx_remap.py
+++ b/cuhpx/hpx_remap.py
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import cuhpx_remap
 import torch
 
+from . import cuhpx_remap
+
 
 def is_power_of_two(n):
     return (n > 0) and (n & (n - 1)) == 0
diff --git a/cuhpx/hpx_sht.py b/cuhpx/hpx_sht.py
index bb77bb2..dc942e8 100644
--- a/cuhpx/hpx_sht.py
+++ b/cuhpx/hpx_sht.py
@@ -13,10 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import cuhpx_fft
 import numpy as np
 import torch
-import torch.cuda.nvtx
 import torch.nn as nn
 from torch.autograd import Function
 
@@ -33,6 +31,8 @@
     p2phi_ring,
 )
 
+from . import cuhpx_fft
+
 
 def healpix_rfft_torch(f: torch.tensor, L: int, nside: int) -> torch.tensor:
 
@@ -449,13 +449,9 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1):
     device = torch.device("cuda")
     chunk_size = int(weights.size(1) / nchunk + 1)  # Adjust this based on your memory constraints
 
-    torch.cuda.nvtx.range_push("Allocate memory for chunk")
     next_chunk_cpu = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, pin_memory=True)
     current_chunk = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, device=device)
     next_chunk = torch.empty_like(current_chunk)
-    torch.cuda.nvtx.range_pop()
-
-    torch.cuda.nvtx.range_push("einsum between x and weights with chunking")
 
     # Create events for synchronization
     event_transfer = torch.cuda.Event(blocking=True)
@@ -475,40 +471,30 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1):
         if actual_chunk_size != chunk_size:
             next_chunk_cpu.resize_((weights.size(0), actual_chunk_size, weights.size(2)))
 
-        torch.cuda.nvtx.range_push("CPU copy from weights to pin memory")
         next_chunk_cpu.copy_(weights[:, start_i:end_i, :])
-        torch.cuda.nvtx.range_pop()
 
         with torch.cuda.stream(stream1):
-            torch.cuda.nvtx.range_push(f"Transfer weights chunk {i}:{end_i} to GPU")
             next_chunk[: weights.size(0), : end_i - start_i, :].copy_(next_chunk_cpu, non_blocking=True)
             event_transfer.record(stream1)
-            torch.cuda.nvtx.range_pop()
 
-        torch.cuda.nvtx.range_push(f"Compute einsum for chunk {i - chunk_size}:{end_i - chunk_size}")
         xout[..., start_j:end_j, :, :] = torch.einsum(
             '...kmn,mlk->...lmn', x, current_chunk[:, : end_j - start_j, :].to(x.dtype)
         )
 
         event_computation.record(torch.cuda.current_stream())
-        torch.cuda.nvtx.range_pop()
         torch.cuda.current_stream().wait_event(event_transfer)
 
         current_chunk, next_chunk = next_chunk, current_chunk
         start_j, end_j = start_i, end_i
 
     if start_i < weights.size(1):
-        torch.cuda.nvtx.range_push("Compute einsum for the last chunk")
         xout[..., start_i:end_i, :, :] = torch.einsum(
             '...kmn,mlk->...lmn', x, current_chunk[:, : end_i - start_i, :].to(x.dtype)
         )
-        torch.cuda.nvtx.range_pop()
 
     stream1.synchronize()
     torch.cuda.current_stream().synchronize()
 
-    torch.cuda.nvtx.range_pop()  # End of einsum with chunking
-
     return xout
 
 
@@ -523,39 +509,28 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside):
         ctx.lmax = lmax
         ctx.nside = nside
 
-        torch.cuda.nvtx.range_push("rfft")
         # SHT
         if x.dim() == 1:
             x = cuhpx_fft.healpix_rfft_class(x, mmax, nside)
         else:
             x = cuhpx_fft.healpix_rfft_batch(x, mmax, nside)
 
-        torch.cuda.nvtx.range_pop()
-
         x = torch.view_as_real(x)
 
         out_shape = list(x.size())
         out_shape[-3] = lmax
         out_shape[-2] = mmax
 
-        torch.cuda.nvtx.range_push("allocate xout")
         xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device)
-        torch.cuda.nvtx.range_pop()
 
-        torch.cuda.nvtx.range_push("einsum between pct and weights")
         weights = pct * weights
-        torch.cuda.nvtx.range_pop()
 
         if not pct.is_cuda:
-            torch.cuda.nvtx.range_push("einsum between x and weights using two stream")
             nchunk = 12
             stream1 = torch.cuda.Stream()
             xout = einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1)
-            torch.cuda.nvtx.range_pop()
         else:
-            torch.cuda.nvtx.range_push("einsum between x and weights")
             xout = torch.einsum('...kmn,mlk->...lmn', x, weights.to(x.dtype))
-            torch.cuda.nvtx.range_pop()
 
         x = torch.view_as_complex(xout.contiguous())
 
@@ -595,18 +570,14 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside):
 
         x = torch.view_as_real(x)
 
-        torch.cuda.nvtx.range_push("einsum between x and pct")
         xs = torch.einsum('...lmn, mlk->...kmn', x, pct.to(x.dtype))
-        torch.cuda.nvtx.range_pop()
 
         x = torch.view_as_complex(xs.contiguous())
 
-        torch.cuda.nvtx.range_push("irfft")
         if x.dim() == 2:
             x = cuhpx_fft.healpix_irfft_class(x, mmax, nside)
         else:
             x = cuhpx_fft.healpix_irfft_batch(x, mmax, nside)
-        torch.cuda.nvtx.range_pop()
 
         return x
 
diff --git a/pyproject.toml b/pyproject.toml
index e4454d1..bf8656e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,21 +1,23 @@
 [build-system]
-requires = ["torch", "setuptools"]
-build-backend = "setuptools.build_meta"
+requires = ["scikit-build-core>=0.8", "pybind11>=2.12", "numpy"]
+build-backend = "scikit_build_core.build"
 
 [project]
 name = "cuHPX"
-version = "2025.5.1"
-description = "GPU-accelerated utilities for data on HEALPix grids."
-readme = "README.md"
-license = { file="LICENSE.txt" }
+version = "2025.8.1"
+description = "CUDA-accelerated HEALPix tools for harmonic transforms and remapping"
 authors = [
-    { name = "NVIDIA", email = "asubramaniam@nvidia.com" }
+  { name = "NVIDIA", email = "asubramaniam@nvidia.com" }
 ]
+readme = "README.md"
+license = { file = "LICENSE.txt" }
+requires-python = ">=3.8"
 dependencies = [
-    "numpy",
-    "torch>=2.0.0",
-    "astropy",
-    "torch_harmonics",
+  "numpy",
+  "astropy",
+  "torch_harmonics",
+  # PyTorch must already be installed when building (--no-build-isolation); this pin enforces the minimum at install time:
+  "torch>=2.4",
 ]
 classifiers = [
     "Development Status :: 2 - Pre-Alpha",
@@ -32,6 +34,18 @@ classifiers = [
 [project.urls]
 "Homepage" = "https://github.com/NVlabs/cuHPX"
 
+[tool.scikit-build]
+wheel.packages = ["cuhpx"]
+cmake.version = ">=3.18"  # replaces cmake.minimum-version, deprecated since scikit-build-core 0.8
+cmake.source-dir = "."
+# Keep build artifacts in build/, not alongside sources
+build-dir = "build/{wheel_tag}"
+sdist.include = ["src", "cuhpx", "tests", "CMakeLists.txt", "LICENSE.txt", "README.md"]
+
+[tool.scikit-build.editable]
+# Key: makes editable installs load extensions from build/ via a redirect hook
+mode = "redirect"
+verbose = true
 
 [tool.black]
 line-length = 120
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 4465e59..0000000
--- a/setup.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from setuptools import find_packages, setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
-    name='cuhpx',
-    version='0.1.0',
-    packages=find_packages(),
-    package_data={'cuhpx': ['**/*.fits']},
-    url='https://gitlab-master.nvidia.com/Devtech-Compute/cuhpx',
-    license='TBD',
-    author='Xiaopo Cheng, Akshay Subramaniam',
-    author_email='xiaopoc@nvidia.com, asubramaniam@nvidia.com',
-    description='A library for performing transformations and analysis on HEALPix',
-    install_requires=[
-        'numpy',
-        'torch',
-        'astropy',
-        'torch_harmonics',
-    ],
-    ext_modules=[
-        CUDAExtension(
-            'cuhpx_remap',
-            sources=[
-                'src/data_remapping/hpx_remapping.cpp',
-                'src/data_remapping/hpx_remapping_cuda.cu',
-            ],
-            extra_compile_args={'nvcc': ['-O2']},
-        ),
-        CUDAExtension(
-            'cuhpx_fft',
-            sources=[
-                'src/harmonic_transform/hpx_fft.cpp',
-                'src/harmonic_transform/hpx_fft_cuda.cu',
-            ],
-            extra_compile_args={'nvcc': ['-O2', '-lineinfo']},
-        ),
-    ],
-    cmdclass={'build_ext': BuildExtension},
-)
diff --git a/src/data_remapping/hpx_remapping.cpp b/src/data_remapping/hpx_remapping.cpp
index fa136d0..632d1e1 100644
--- a/src/data_remapping/hpx_remapping.cpp
+++ b/src/data_remapping/hpx_remapping.cpp
@@ -449,7 +449,7 @@ torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin,
 
 
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+PYBIND11_MODULE(cuhpx_remap, m) {
 
     m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)");
     m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)");
diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp
index 90c8af1..9ec28ac 100644
--- a/src/harmonic_transform/hpx_fft.cpp
+++ b/src/harmonic_transform/hpx_fft.cpp
@@ -671,7 +671,7 @@ torch::Tensor healpix_irfft(torch::Tensor ftm, int L, int nside) {
     return f;
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+PYBIND11_MODULE(cuhpx_fft, m) {
     m.def("healpix_rfft", &healpix_rfft, "HEALPix RFFT");
     m.def("healpix_irfft", &healpix_irfft, "HEALPix IRFFT");
     m.def("healpix_rfft_cufft", &healpix_rfft_cufft, "HEALPix RFFT with cuFFT");