diff --git a/ci/run_pytests_cpu.sh b/ci/run_pytests_cpu.sh
index 50e138ac..16a74640 100755
--- a/ci/run_pytests_cpu.sh
+++ b/ci/run_pytests_cpu.sh
@@ -18,6 +18,7 @@ set -e -E -u -o pipefail
 cd legateboost/test
 
 legate \
+    --gpus 0 \
     --sysmem 28000 \
     --module pytest \
     . \
diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml
index f0b58cd7..194fd852 100644
--- a/conda/environments/all_cuda-122.yaml
+++ b/conda/environments/all_cuda-122.yaml
@@ -15,6 +15,7 @@ dependencies:
 - cuda-version>=12.2
 - cupynumeric==25.01.*,>=0.0.0.dev0
 - hypothesis>=6
+- legate-sparse
 - legate==25.01.*,>=0.0.0.dev0
 - libcublas-dev
 - llvm-openmp
diff --git a/dependencies.yaml b/dependencies.yaml
index 1bb50135..94590d24 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -173,3 +173,4 @@ dependencies:
           - pytest>=7,<8
           - seaborn>=0.13
           - xgboost>=2.0
+          - legate-sparse
diff --git a/examples/sparse/README.md b/examples/sparse/README.md
new file mode 100644
index 00000000..c0cd3efc
--- /dev/null
+++ b/examples/sparse/README.md
@@ -0,0 +1,3 @@
+# Sparse data
+
+This example trains a youtube comment spam classifier on a sparse dataset. The comments as raw strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn.
diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py
new file mode 100644
index 00000000..a189beff
--- /dev/null
+++ b/examples/sparse/sparse.py
@@ -0,0 +1,47 @@
+import pandas as pd
+from sklearn.datasets import fetch_openml
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+
+import legateboost as lb
+
+# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset].
+# UCI Machine Learning Repository. https://doi.org/10.24432/C58885.
+dataset_names = [
+    "youtube-spam-psy",
+    "youtube-spam-shakira",
+    "youtube-spam-lmfao",
+    "youtube-spam-eminem",
+    "youtube-spam-katyperry",
+]
+X = []
+for dataset_name in dataset_names:
+    dataset = fetch_openml(name=dataset_name, as_frame=True)
+    X.append(dataset.data)
+
+X = pd.concat(X)
+y = X["CLASS"]
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
+vectorizer = CountVectorizer()
+X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"])
+X_test_vectorized = vectorizer.transform(X_test["CONTENT"])
+
+model = lb.LBClassifier().fit(
+    X_train_vectorized, y_train, eval_set=[(X_test_vectorized, y_test)]
+)
+
+
+def evaluate_comment(comment):
+    print("Comment: {}".format(comment))
+    print(
+        "Probability of spam: {}".format(
+            model.predict_proba(vectorizer.transform([comment]))[0, 1]
+        )
+    )
+
+
+evaluate_comment(X_test.iloc[15]["CONTENT"])
+evaluate_comment(X_test.iloc[3]["CONTENT"])
+evaluate_comment("Your text here")
diff --git a/legateboost/input_validation.py b/legateboost/input_validation.py
index 38990a34..77c384f0 100644
--- a/legateboost/input_validation.py
+++ b/legateboost/input_validation.py
@@ -3,6 +3,11 @@
 import numpy as np
 import scipy.sparse as sp
 
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
+
 import cupynumeric as cn
 
 __all__: List[str] = []
@@ -29,22 +34,22 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray:
 
 def check_array(x: Any) -> cn.ndarray:
     if sp.issparse(x):
-        raise ValueError("Sparse matrix not allowed.")
-
-    if not hasattr(x, "__legate_data_interface__"):
-        x = cn.array(np.require(x, requirements=["C", "A"]))
-    if hasattr(x, "__array_interface__"):
-        shape = x.__array_interface__["shape"]
-        if shape[0] <= 0:
-            raise ValueError(
-                "Found array with %d sample(s) (shape=%s) while a"
-                " minimum of %d is required." % (shape[0], shape, 1)
-            )
-        if len(shape) >= 2 and 0 in shape:
-            raise ValueError(
-                "Found array with %d feature(s) (shape=%s) while"
-                " a minimum of %d is required." % (shape[1], shape, 1)
-            )
+        x = csr_matrix(x)
+    elif isinstance(x, csr_matrix):
+        pass
+    else:
+        x = cn.array(x, copy=False)
+
+    if x.shape[0] <= 0:
+        raise ValueError(
+            "Found array with %d sample(s) (shape=%s) while a"
+            " minimum of %d is required." % (x.shape[0], x.shape, 1)
+        )
+    if len(x.shape) >= 2 and 0 in x.shape:
+        raise ValueError(
+            "Found array with %d feature(s) (shape=%s) while"
+            " a minimum of %d is required." % (x.shape[1], x.shape, 1)
+        )
 
     if cn.iscomplexobj(x):
         raise ValueError("Complex data not supported.")
@@ -52,8 +57,6 @@ def check_array(x: Any) -> cn.ndarray:
     if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()):
         raise ValueError("Input contains NaN or inf")
 
-    x = cn.array(x, copy=False)
-
     return x
 
 
diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py
index 07408559..1b23cdcb 100644
--- a/legateboost/models/base_model.py
+++ b/legateboost/models/base_model.py
@@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
         """
         pass
 
+    def supports_csr(self) -> bool:
+        """Whether the model supports CSR matrix input.
+
+        Returns
+        -------
+        bool
+            True if the model supports CSR matrix input, False otherwise.
+        """
+        return False
+
     @abstractmethod
     def __str__(self) -> str:
         pass
diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py
index a3aa8c37..ff4079d3 100644
--- a/legateboost/models/tree.py
+++ b/legateboost/models/tree.py
@@ -1,9 +1,20 @@
 import copy
 from enum import IntEnum
-from typing import Any
+from typing import Any, Union
 
 import cupynumeric as cn
-from legate.core import TaskTarget, get_legate_runtime, types
+from legate.core import (
+    ImageComputationHint,
+    TaskTarget,
+    get_legate_runtime,
+    image,
+    types,
+)
+
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
 
 from ..library import user_context, user_lib
 from ..utils import get_store
@@ -12,7 +23,9 @@
 
 class LegateBoostOpCode(IntEnum):
     BUILD_TREE = user_lib.cffi.BUILD_TREE
-    PREDICT = user_lib.cffi.PREDICT
+    BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR
+    PREDICT_TREE = user_lib.cffi.PREDICT_TREE
+    PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR
     UPDATE_TREE = user_lib.cffi.UPDATE_TREE
 
 
@@ -54,12 +67,7 @@ def __init__(
         self.split_samples = split_samples
         self.alpha = alpha
 
-    def fit(
-        self,
-        X: cn.ndarray,
-        g: cn.ndarray,
-        h: cn.ndarray,
-    ) -> "Tree":
+    def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree":
         num_outputs = g.shape[1]
 
         task = get_legate_runtime().create_auto_task(
@@ -120,6 +128,90 @@ def fit(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
+    def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree":
+        num_outputs = g.shape[1]
+
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.BUILD_TREE_CSR
+        )
+
+        # promote these to 3d. When the g/h shapes match those of the dense version,
+        # it makes code reuse easier on the C++ side
+        g_ = get_store(g).promote(1, 1)
+        h_ = get_store(h).promote(1, 1)
+
+        task.add_scalar_arg(self.max_depth, types.int32)
+        max_nodes = 2 ** (self.max_depth + 1)
+        task.add_scalar_arg(max_nodes, types.int32)
+        task.add_scalar_arg(self.alpha, types.float64)
+        task.add_scalar_arg(self.split_samples, types.int32)
+        task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32)
+        task.add_scalar_arg(X.shape[0], types.int64)
+        task.add_scalar_arg(X.shape[1], types.int64)
+
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        task.add_input(g_)
+        task.add_input(h_)
+        pos_promoted = X.pos.promote(1, g.shape[1]).promote(1, 1)
+        # we don't need this input but use it for alignment
+        task.add_input(pos_promoted)
+
+        task.add_alignment(g_, h_)
+        task.add_alignment(g_, pos_promoted)
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+
+        # outputs
+        leaf_value = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        feature = get_legate_runtime().create_store(types.int32, (max_nodes,))
+        split_value = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        gain = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        hessian = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        task.add_output(leaf_value)
+        task.add_output(feature)
+        task.add_output(split_value)
+        task.add_output(gain)
+        task.add_output(hessian)
+        task.add_broadcast(leaf_value)
+        task.add_broadcast(feature)
+        task.add_broadcast(split_value)
+        task.add_broadcast(gain)
+        task.add_broadcast(hessian)
+
+        if get_legate_runtime().machine.count(TaskTarget.GPU) > 1:
+            task.add_nccl_communicator()
+        elif get_legate_runtime().machine.count() > 1:
+            task.add_cpu_communicator()
+        task.execute()
+
+        self.leaf_value = cn.array(leaf_value, copy=False)
+        self.feature = cn.array(feature, copy=False)
+        self.split_value = cn.array(split_value, copy=False)
+        self.gain = cn.array(gain, copy=False)
+        self.hessian = cn.array(hessian, copy=False)
+        return self
+
+    def fit(
+        self,
+        X: Union[cn.ndarray, csr_matrix],
+        g: cn.ndarray,
+        h: cn.ndarray,
+    ) -> "Tree":
+        if isinstance(X, csr_matrix):
+            return self.fit_csr(X, g, h)
+        return self.fit_dense(X, g, h)
+
     def clear(self) -> None:
         self.leaf_value.fill(0)
         self.hessian.fill(0)
@@ -168,12 +260,12 @@ def update(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
-    def predict(self, X: cn.ndarray) -> cn.ndarray:
+    def predict_dense(self, X: cn.ndarray) -> cn.ndarray:
         n_rows = X.shape[0]
         n_features = X.shape[1]
         n_outputs = self.leaf_value.shape[1]
         task = get_legate_runtime().create_auto_task(
-            user_context, LegateBoostOpCode.PREDICT
+            user_context, LegateBoostOpCode.PREDICT_TREE
         )
 
         pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
@@ -197,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
 
         task.add_alignment(X_, pred_)
         task.execute()
+        return cn.array(pred, copy=False)
 
+    def predict_csr(self, X: csr_matrix) -> cn.ndarray:
+        n_rows = X.shape[0]
+        n_outputs = self.leaf_value.shape[1]
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.PREDICT_TREE_CSR
+        )
+
+        pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        pos_var_broadcast = X.pos.promote(1, n_outputs)
+        task.add_alignment(pos_var_broadcast, pred)
+
+        # scalars
+        task.add_scalar_arg(X.shape[1], types.int32)
+
+        # output
+        task.add_output(
+            pred.promote(1, 1)
+        )  # add 1 dimension so it has the same dimension as dense version
+        task.add_output(pred)  # only here for alignment, no used
+
+        # broadcast the tree structure
+        leaf_value_ = get_store(self.leaf_value)
+        feature_ = get_store(self.feature)
+        split_value_ = get_store(self.split_value)
+        task.add_input(leaf_value_)
+        task.add_input(feature_)
+        task.add_input(split_value_)
+        task.add_broadcast(leaf_value_)
+        task.add_broadcast(feature_)
+        task.add_broadcast(split_value_)
+
+        task.add_input(pos_var_broadcast)  # used only for alignment
+        task.execute()
         return cn.array(pred, copy=False)
 
+    def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray:
+        if isinstance(X, csr_matrix):
+            return self.predict_csr(X)
+        return self.predict_dense(X)
+
     def is_leaf(self, id: int) -> Any:
         return self.feature[id] == -1
 
@@ -245,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree":
         new = copy.deepcopy(self)
         new.leaf_value *= scalar
         return new
+
+    def supports_csr(self) -> bool:
+        return True
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index 8502eef3..bb9fa9aa 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -72,3 +72,21 @@ def test_alpha():
     )
     model.fit(X, y)
     assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha))
+
+
+def test_csr():
+    csr_matrix = pytest.importorskip("legate_sparse").csr_matrix
+    X = csr_matrix(
+        (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])),
+        shape=(2, 3),
+    )
+    g = cn.array([[1.0], [-1.0]])
+    h = cn.array([[1.0], [1.0]])
+
+    model = (
+        lb.models.Tree(alpha=0.0)
+        .set_random_state(np.random.RandomState(2))
+        .fit(X, g, h)
+    )
+    print(model)
+    assert np.allclose(model.predict(X), -g / h)
diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py
index 5d02b421..f6fdcf41 100644
--- a/legateboost/test/test_estimator.py
+++ b/legateboost/test/test_estimator.py
@@ -1,12 +1,17 @@
 import numpy as np
 import pytest
+import scipy
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
 import cupynumeric as cn
 import legateboost as lb
-from legateboost.testing.utils import non_increasing, sanity_check_models
+from legateboost.testing.utils import (
+    all_base_models,
+    non_increasing,
+    sanity_check_models,
+)
 
 
 def test_init():
@@ -225,3 +230,56 @@ def test_iterator_methods():
     assert list(model) == list(model.models_)
     for i, est in enumerate(model):
         assert est == model[i]
+
+
+@pytest.mark.parametrize(
+    "base_model", filter(lambda m: m.supports_csr(), all_base_models()), ids=type
+)
+def test_csr_input(base_model):
+    csr_matrix = pytest.importorskip("legate_sparse").csr_matrix
+    X_dense = cn.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]])
+    X_scipy = scipy.sparse.csr_matrix(X_dense)
+    X_legate_sparse = csr_matrix(X_scipy)
+    y = cn.array([1.0, 2.0])
+    model = lb.LBRegressor(
+        init=None,
+        n_estimators=1,
+        base_models=(base_model,),
+        learning_rate=1.0,
+    )
+    sparse_pred = model.fit(X_scipy, y).predict(X_scipy)
+    legate_sparse_pred = model.fit(X_legate_sparse, y).predict(X_legate_sparse)
+    dense_pred = model.fit(X_dense, y).predict(X_dense)
+    assert cn.allclose(sparse_pred, legate_sparse_pred)
+    assert cn.allclose(sparse_pred, dense_pred)
+
+    # Generate a sparse dataset and check that dense and sparse
+    # input give equivalent results
+    # unfortunately we can't test that they are bitwise identical
+    # the changing order of floating point sums can lead to different results
+    # so instead assert that sparse models are roughly as accurate as dense ones
+    rng = np.random.RandomState(0)
+    X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32)
+    y = rng.randint(0, 5, 100)
+    X_scipy = scipy.sparse.csr_matrix(X)
+
+    # regression
+    params = {"base_models": (base_model,), "n_estimators": 5, "random_state": 0}
+    sparse_model = lb.LBRegressor(**params)
+    dense_model = lb.LBRegressor(**params)
+    dense_pred = dense_model.fit(X, y).predict(X)
+    X_csr = csr_matrix(X)
+    assert X_csr.nnz < X.size
+    sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr)
+    assert sparse_model.score(X, y) > 0.4
+    assert dense_model.score(X, y) > 0.4
+    sanity_check_models(sparse_model)
+
+    # classification
+    sparse_model = lb.LBClassifier(**params)
+    dense_model = lb.LBClassifier(**params)
+    dense_pred = dense_model.fit(X, y).predict_proba(X)
+    sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr)
+    assert sparse_model.score(X, y) > 0.8
+    assert dense_model.score(X, y) > 0.8
+    sanity_check_models(sparse_model)
diff --git a/pyproject.toml b/pyproject.toml
index be4346af..8b14144a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ requires-python = ">=3.10"
 [project.optional-dependencies]
 test = [
     "hypothesis>=6",
+    "legate-sparse",
     "matplotlib>=3.9",
     "mypy>=1.13",
     "nbconvert>=7.16",
diff --git a/src/legateboost.h b/src/legateboost.h
index bf8af18f..892c274a 100644
--- a/src/legateboost.h
+++ b/src/legateboost.h
@@ -20,7 +20,7 @@
 enum LegateBoostOpCode {  // NOLINT(performance-enum-size)
   OP_CODE_BASE = 0,
   BUILD_TREE   = 1,
-  PREDICT      = 2,
+  PREDICT_TREE = 2,
   UPDATE_TREE  = 3,
   /* special */
   ERF     = 4,
@@ -29,9 +29,11 @@ enum LegateBoostOpCode {  // NOLINT(performance-enum-size)
   DIGAMMA = 7,
   ZETA    = 8,
   /**/
-  GATHER   = 9,
-  RBF      = 10,
-  BUILD_NN = 11,
+  GATHER           = 9,
+  RBF              = 10,
+  BUILD_NN         = 11,
+  BUILD_TREE_CSR   = 12,
+  PREDICT_TREE_CSR = 13,
 };
 
 #endif  // SRC_LEGATEBOOST_H_
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index c942de45..0d14872a 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -25,6 +25,7 @@
 #include "legate_library.h"
 #include "legateboost.h"
 #include "cpp_utils/cpp_utils.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 namespace {
@@ -128,10 +129,9 @@ void WriteTreeOutput(legate::TaskContext context, const Tree& tree)
 // Share the samples with all workers
 // Remove any duplicates
 // Return sparse matrix of split samples for each feature
-template <typename T>
+template <typename T, template <typename> class XMatrix>
 auto SelectSplitSamples(legate::TaskContext context,
-                        const legate::AccessorRO<T, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const XMatrix<T>& X,
                         int split_samples,
                         int seed,
                         int64_t dataset_rows) -> SparseSplitProposals<T>
@@ -144,23 +144,22 @@ auto SelectSplitSamples(legate::TaskContext context,
     return dist(eng);
   });
 
-  auto num_features    = X_shape.hi[1] - X_shape.lo[1] + 1;
-  auto draft_proposals = legate::create_buffer<T, 2>({num_features, split_samples});
+  auto draft_proposals = legate::create_buffer<T, 2>({X.NumFeatures(), split_samples});
   for (int i = 0; i < split_samples; i++) {
     auto row            = row_samples[i];
-    bool const has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0];
-    for (int j = 0; j < num_features; j++) {
-      draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0);
+    const bool has_data = X.RowSubset().contains(row);
+    for (int j = 0; j < X.NumFeatures(); j++) {
+      draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
     }
   }
-  SumAllReduce(context, tcb::span<T>(draft_proposals.ptr({0, 0}), num_features * split_samples));
+  SumAllReduce(context, tcb::span<T>(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples));
 
   // Sort samples
   std::vector<T> split_proposals_tmp;
-  split_proposals_tmp.reserve(num_features * split_samples);
-  auto row_pointers = legate::create_buffer<int32_t, 1>(num_features + 1);
+  split_proposals_tmp.reserve(X.NumFeatures() * split_samples);
+  auto row_pointers = legate::create_buffer<int32_t, 1>(X.NumFeatures() + 1);
   row_pointers[0]   = 0;
-  for (int j = 0; j < num_features; j++) {
+  for (int j = 0; j < X.NumFeatures(); j++) {
     auto ptr = draft_proposals.ptr({j, 0});
     tcb::span<T> const feature_proposals(draft_proposals.ptr({j, 0}), split_samples);
     std::set<T> const unique(feature_proposals.begin(), feature_proposals.end());
@@ -173,17 +172,18 @@ auto SelectSplitSamples(legate::TaskContext context,
 
   // Set the largest split sample to +inf such that an element must belong to one of the bins
   // i.e. we cannot go off the end when searching for a bin
-  for (int feature = 0; feature < num_features; feature++) {
+  for (int feature = 0; feature < X.NumFeatures(); feature++) {
     auto end                 = row_pointers[feature + 1];
     split_proposals[end - 1] = std::numeric_limits<T>::infinity();
   }
 
   return SparseSplitProposals<T>(
-    split_proposals, row_pointers, num_features, split_proposals_tmp.size());
+    split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size());
 }
 
-template <typename T>
+template <typename MatrixT>
 struct TreeBuilder {
+  using T = typename MatrixT::value_type;
   TreeBuilder(int32_t num_rows,
               int32_t num_features,
               int32_t num_outputs,
@@ -194,6 +194,7 @@ struct TreeBuilder {
       num_features(num_features),
       num_outputs(num_outputs),
       max_nodes(max_nodes),
+      max_depth(max_depth),
       split_proposals(split_proposals)
   {
     sorted_positions = legate::create_buffer<std::tuple<int32_t, int32_t>>(num_rows);
@@ -211,24 +212,48 @@ struct TreeBuilder {
                                  split_proposals.histogram_size);
     max_batch_size = max_histogram_nodes;
   }
-  template <typename TYPE>
-  void ComputeHistogram(Histogram<GPair> histogram,
-                        legate::TaskContext context,
-                        Tree& tree,
-                        const legate::AccessorRO<TYPE, 3>& X,
-                        legate::Rect<3> X_shape,
-                        const legate::AccessorRO<double, 3>& g,
-                        const legate::AccessorRO<double, 3>& h,
-                        NodeBatch batch)
+
+  auto Build(legate::TaskContext context,
+             const MatrixT& X_matrix,
+             const legate::AccessorRO<double, 3>& g_accessor,
+             const legate::AccessorRO<double, 3>& h_accessor,
+             legate::Rect<3> g_shape,
+             double alpha) -> Tree
+  {
+    // Begin building the tree
+    Tree tree(max_nodes, narrow<int>(num_outputs));
+    this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = this->PrepareBatches(depth);
+      for (auto batch : batches) {
+        auto histogram = this->GetHistogram(batch);
+
+        this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
+
+        this->PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); }
+    }
+    return tree;
+  }
+
+  void DenseHistogramKernel(const Tree& tree,
+                            Histogram<GPair>& histogram,
+                            const DenseXMatrix<T>& X,
+                            const legate::AccessorRO<double, 3>& g,
+                            const legate::AccessorRO<double, 3>& h,
+                            NodeBatch batch)
   {
     // Build the histogram
     for (auto [position, index_local] : batch) {
-      auto index_global  = index_local + X_shape.lo[0];
+      auto index_global  = index_local + X.RowSubset().lo[0];
       bool const compute = ComputeHistogramBin(
         position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position)));
       if (position < 0 || !compute) { continue; }
       for (int64_t j = 0; j < num_features; j++) {
-        auto x_value      = X[{index_global, j, 0}];
+        auto x_value      = X.Get(index_global, j);
         int const bin_idx = split_proposals.FindBin(x_value, j);
 
         for (int64_t k = 0; k < num_outputs; ++k) {
@@ -237,6 +262,48 @@ struct TreeBuilder {
         }
       }
     }
+  }
+
+  // Kernel specialised to iterate only over the non-zero elements of the sparse matrix
+  void CSRHistogramKernel(const Tree& tree,
+                          Histogram<GPair>& histogram,
+                          const CSRXMatrix<T>& X,
+                          const legate::AccessorRO<double, 3>& g,
+                          const legate::AccessorRO<double, 3>& h,
+                          NodeBatch batch)
+  {
+    // Build the histogram
+    for (auto [position, index_local] : batch) {
+      auto index_global  = index_local + X.RowSubset().lo[0];
+      bool const compute = ComputeHistogramBin(
+        position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position)));
+      if (position < 0 || !compute) { continue; }
+      auto row_range = X.row_ranges[index_global];
+      for (auto element_idx = row_range.lo[0]; element_idx <= row_range.hi[0]; element_idx++) {
+        auto feature      = X.column_indices[element_idx];
+        auto x            = X.values[element_idx];
+        int const bin_idx = split_proposals.FindBin(x, feature);
+        for (int64_t k = 0; k < num_outputs; ++k) {
+          histogram[{position, k, bin_idx}] +=
+            GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]};
+        }
+      }
+    }
+  }
+
+  void ComputeHistogram(Histogram<GPair> histogram,
+                        legate::TaskContext context,
+                        const Tree& tree,
+                        const MatrixT& X,
+                        const legate::AccessorRO<double, 3>& g,
+                        const legate::AccessorRO<double, 3>& h,
+                        NodeBatch batch)
+  {
+    if constexpr (std::is_same_v<MatrixT, DenseXMatrix<T>>) {
+      this->DenseHistogramKernel(tree, histogram, X, g, h, batch);
+    } else {
+      this->CSRHistogramKernel(tree, histogram, X, g, h, batch);
+    }
 
     // NCCL cannot allreduce custom types, need to reinterpret as double
     SumAllReduce(
@@ -247,7 +314,7 @@ struct TreeBuilder {
     this->Scan(histogram, batch, tree);
   }
 
-  void Scan(Histogram<GPair> histogram, NodeBatch batch, Tree& tree)
+  void Scan(Histogram<GPair> histogram, NodeBatch batch, const Tree& tree)
   {
     auto scan_node_histogram = [&](int node_idx) {
       for (int feature = 0; feature < num_features; feature++) {
@@ -298,8 +365,12 @@ struct TreeBuilder {
       }
     }
   }
-  void PerformBestSplit(Tree& tree, Histogram<GPair> histogram, double alpha, NodeBatch batch)
+  void PerformBestSplit(Tree& tree,
+                        const Histogram<GPair>& histogram,
+                        double alpha,
+                        NodeBatch batch)
   {
+    const bool is_sparse_matrix = std::is_same_v<MatrixT, CSRXMatrix<T>>;
     for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) {
       double best_gain = 0;
       int best_feature = -1;
@@ -309,10 +380,18 @@ struct TreeBuilder {
         for (int bin_idx = feature_begin; bin_idx < feature_end; bin_idx++) {
           double gain = 0;
           for (int output = 0; output < num_outputs; ++output) {
-            auto [G_L, H_L]  = histogram[{node_id, output, bin_idx}];
-            auto [G, H]      = tree.node_sums[{node_id, output}];
-            auto G_R         = G - G_L;
-            auto H_R         = H - H_L;
+            auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                        split_proposals,
+                                                        tree.node_sums[{node_id, output}],
+                                                        node_id,
+                                                        output,
+                                                        bin_idx,
+                                                        feature,
+                                                        is_sparse_matrix);
+            auto [G_L, H_L]            = left_sum;
+            auto [G_R, H_R]            = right_sum;
+            auto [G, H]                = tree.node_sums[{node_id, output}];
+
             double const reg = std::max(eps, alpha);  // Regularisation term
             gain +=
               0.5 * ((G_L * G_L) / (H_L + reg) + (G_R * G_R) / (H_R + reg) - (G * G) / (H + reg));
@@ -325,34 +404,37 @@ struct TreeBuilder {
         }
       }
       if (best_gain > eps) {
-        std::vector<double> left_leaf(num_outputs);
-        std::vector<double> right_leaf(num_outputs);
-        std::vector<GPair> left_sum(num_outputs);
-        std::vector<GPair> right_sum(num_outputs);
+        std::vector<double> left_leaves(num_outputs);
+        std::vector<double> right_leaves(num_outputs);
+        std::vector<GPair> left_sums(num_outputs);
+        std::vector<GPair> right_sums(num_outputs);
         for (int output = 0; output < num_outputs; ++output) {
-          auto [G_L, H_L]    = histogram[{node_id, output, best_bin}];
-          auto [G, H]        = tree.node_sums[{node_id, output}];
-          auto G_R           = G - G_L;
-          auto H_R           = H - H_L;
-          left_leaf[output]  = CalculateLeafValue(G_L, H_L, alpha);
-          right_leaf[output] = CalculateLeafValue(G_R, H_R, alpha);
-          left_sum[output]   = {G_L, H_L};
-          right_sum[output]  = {G_R, H_R};
+          auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                      split_proposals,
+                                                      tree.node_sums[{node_id, output}],
+                                                      node_id,
+                                                      output,
+                                                      best_bin,
+                                                      best_feature,
+                                                      is_sparse_matrix);
+          left_leaves[output]        = CalculateLeafValue(left_sum.grad, left_sum.hess, alpha);
+          right_leaves[output]       = CalculateLeafValue(right_sum.grad, right_sum.hess, alpha);
+          left_sums[output]          = left_sum;
+          right_sums[output]         = right_sum;
         }
-        if (left_sum[0].hess <= 0.0 || right_sum[0].hess <= 0.0) { continue; }
+        if (left_sums[0].hess <= 0.0 || right_sums[0].hess <= 0.0) { continue; }
         tree.AddSplit(node_id,
                       best_feature,
                       split_proposals.split_proposals[legate::coord_t{best_bin}],
-                      left_leaf,
-                      right_leaf,
+                      left_leaves,
+                      right_leaves,
                       best_gain,
-                      left_sum,
-                      right_sum);
+                      left_sums,
+                      right_sums);
       }
     }
   }
-  template <typename TYPE>
-  void UpdatePositions(Tree& tree, const legate::AccessorRO<TYPE, 3>& X, legate::Rect<3> X_shape)
+  void UpdatePositions(Tree& tree, const MatrixT& X)
   {
     // Update the positions
     for (int i = 0; i < num_rows; i++) {
@@ -361,7 +443,7 @@ struct TreeBuilder {
         sorted_positions[i] = {-1, index_local};
         continue;
       }
-      auto x              = X[{X_shape.lo[0] + index_local, tree.feature[pos], 0}];
+      auto x              = X.Get(X.RowSubset().lo[0] + index_local, tree.feature[pos]);
       bool const left     = x <= tree.split_value[pos];
       pos                 = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
       sorted_positions[i] = {pos, index_local};
@@ -448,18 +530,20 @@ struct TreeBuilder {
   int32_t num_features;
   int32_t num_outputs;
   int32_t max_nodes;
+  int32_t max_depth;
   int max_batch_size;
   SparseSplitProposals<T> split_proposals;
   Histogram<GPair> histogram;
 };
 
-struct build_tree_fn {
+struct build_tree_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
     auto [X, X_shape, X_accessor] = GetInputStore<T, 3>(context.input(0).data());
     auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(1).data());
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(2).data());
+
     EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape);
     auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
     auto num_rows     = std::max<int64_t>(X_shape.hi[0] - X_shape.lo[0] + 1, 0);
@@ -477,39 +561,74 @@ struct build_tree_fn {
     auto seed          = context.scalars().at(4).value<int>();
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
 
-    Tree tree(max_nodes, narrow<int>(num_outputs));
+    DenseXMatrix<T> const X_matrix(X_accessor, X_shape);
+
     SparseSplitProposals<T> const split_proposals =
-      SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows);
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
-    // Begin building the tree
-    TreeBuilder<T> builder(
-      num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals);
+    // Dispatch the tree building algorithm templated on the matrix type
+    auto tree = TreeBuilder<DenseXMatrix<T>>(
+                  num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
+    WriteTreeOutput(context, tree);
+  }
+};
 
-        builder.ComputeHistogram(
-          histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch);
+struct build_tree_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1>, 1>(context.input(2).data());
+    auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(3).data());
+    auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(4).data());
+
+    auto num_rows    = std::max<int64_t>(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
+    auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1;
+    EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers.");
+
+    // Scalars
+    auto max_depth     = context.scalars().at(0).value<int>();
+    auto max_nodes     = context.scalars().at(1).value<int>();
+    auto alpha         = context.scalars().at(2).value<double>();
+    auto split_samples = context.scalars().at(3).value<int>();
+    auto seed          = context.scalars().at(4).value<int>();
+    auto dataset_rows  = context.scalars().at(5).value<int64_t>();
+    auto num_features  = context.scalars().at(6).value<int64_t>();
+
+    CSRXMatrix<T> const X_matrix(X_vals_accessor,
+                                 X_coords_accessor,
+                                 X_offsets_accessor,
+                                 X_vals_shape,
+                                 X_offsets_shape,
+                                 num_features,
+                                 X_vals_shape.volume());
+    const SparseSplitProposals<T> split_proposals =
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
+
+    auto tree = TreeBuilder<CSRXMatrix<T>>(
+                  num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); }
-    }
     WriteTreeOutput(context, tree);
   }
 };
-
 }  // namespace
 
-/*static*/ void BuildTreeTask::cpu_variant(legate::TaskContext context)
+/*static*/ void BuildTreeDenseTask::cpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  legateboost::type_dispatch_float(X.code(), build_tree_dense_fn(), context);
+}
+
+/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
-  legateboost::type_dispatch_float(X.code(), build_tree_fn(), context);
+  legateboost::type_dispatch_float(X.code(), build_tree_csr_fn(), context);
 }
 
 }  // namespace legateboost
@@ -518,6 +637,7 @@ namespace  // unnamed
 {
 void __attribute__((constructor)) register_tasks()
 {
-  legateboost::BuildTreeTask::register_variants();
+  legateboost::BuildTreeDenseTask::register_variants();
+  legateboost::BuildTreeCSRTask::register_variants();
 }
 }  // namespace
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index 3c6e724d..e248b655 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -21,17 +21,18 @@
 #include <thrust/sort.h>
 #include <thrust/random.h>
 #include <thrust/unique.h>
-#include <cstddef>
 #include <numeric>
 #include <limits>
 #include <vector>
 #include <algorithm>
+#include <cstddef>
 #include "legate_library.h"
 #include "legateboost.h"
 #include "../../cpp_utils/cpp_utils.h"
 #include "../../cpp_utils/cpp_utils.cuh"
 #include "legate/comm/coll.h"
 #include "build_tree.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
@@ -271,8 +272,7 @@ struct HistogramAgent {
     }
   };
 
-  const legate::AccessorRO<T, 3>& X;
-  const int64_t& sample_offset;
+  const DenseXMatrix<T>& X;
   const legate::AccessorRO<double, 3>& g;
   const legate::AccessorRO<double, 3>& h;
   const size_t& n_outputs;
@@ -288,8 +288,7 @@ struct HistogramAgent {
   int feature_stride;
   SharedMemoryHistogram shared_histogram;
 
-  __device__ HistogramAgent(const legate::AccessorRO<T, 3>& X,
-                            const int64_t& sample_offset,
+  __device__ HistogramAgent(const DenseXMatrix<T>& X,
                             const legate::AccessorRO<double, 3>& g,
                             const legate::AccessorRO<double, 3>& h,
                             const size_t& n_outputs,
@@ -302,7 +301,6 @@ struct HistogramAgent {
                             const int64_t& seed,
                             SharedMemoryHistogramType* shared_memory)
     : X(X),
-      sample_offset(sample_offset),
       g(g),
       h(h),
       n_outputs(n_outputs),
@@ -343,11 +341,10 @@ struct HistogramAgent {
           sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)));
       if (!computeHistogram) { continue; }
 
-      auto x = X[{sample_offset + local_sample_idx, feature, 0}];
-      // int bin_idx = shared_split_proposals.FindBin(x, feature);
+      auto x            = X.Get(X.RowSubset().lo[0] + local_sample_idx, feature);
       int const bin_idx = split_proposals.FindBin(x, feature);
 
-      legate::Point<3> p = {sample_offset + local_sample_idx, 0, output};
+      legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx, 0, output};
       auto gpair_quantised =
         quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
@@ -387,7 +384,7 @@ struct HistogramAgent {
     std::array<T, kItemsPerThread> x{};
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      x[i] = X[{sample_offset + local_sample_idx[i], feature[i], 0}];
+      x[i] = X.Get(X.RowSubset().lo[0] + local_sample_idx[i], feature[i]);
     }
 
     std::array<int, kItemsPerThread> bin_idx{};
@@ -398,7 +395,7 @@ struct HistogramAgent {
     std::array<IntegerGPair, kItemsPerThread> gpair{};
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output};
+      legate::Point<3> p = {sample_node[i] + local_sample_idx[i], 0, output};
       gpair[i] = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
     }
 #pragma unroll
@@ -443,18 +440,17 @@ struct HistogramAgent {
 // NOLINTBEGIN(performance-unnecessary-value-param)
 template <typename T, int kBlockThreads, int kItemsPerThread>
 __global__ void __launch_bounds__(kBlockThreads)
-  fill_histogram_shared(legate::AccessorRO<T, 3> X,
-                        int64_t sample_offset,
-                        legate::AccessorRO<double, 3> g,
-                        legate::AccessorRO<double, 3> h,
-                        size_t n_outputs,
-                        SparseSplitProposals<T> split_proposals,
-                        NodeBatch batch,
-                        Histogram<IntegerGPair> histogram,
-                        legate::Buffer<IntegerGPair, 2> node_sums,
-                        GradientQuantiser quantiser,
-                        legate::Buffer<int> feature_groups,
-                        int64_t seed)
+  fill_histogram_dense(DenseXMatrix<T> X,
+                       legate::AccessorRO<double, 3> g,
+                       legate::AccessorRO<double, 3> h,
+                       size_t n_outputs,
+                       SparseSplitProposals<T> split_proposals,
+                       NodeBatch batch,
+                       Histogram<IntegerGPair> histogram,
+                       legate::Buffer<IntegerGPair, 2> node_sums,
+                       GradientQuantiser quantiser,
+                       legate::Buffer<int> feature_groups,
+                       int64_t seed)
 {
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
   __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)];
@@ -463,7 +459,6 @@ __global__ void __launch_bounds__(kBlockThreads)
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
     reinterpret_cast<SharedMemoryHistogramType*>(shared_char);
   HistogramAgent<T, kBlockThreads, kItemsPerThread> agent(X,
-                                                          sample_offset,
                                                           g,
                                                           h,
                                                           n_outputs,
@@ -479,9 +474,56 @@ __global__ void __launch_bounds__(kBlockThreads)
 }
 // NOLINTEND(performance-unnecessary-value-param)
 
+// NOLINTBEGIN(performance-unnecessary-value-param)
+template <typename T, int kBlockThreads>
+__global__ void __launch_bounds__(kBlockThreads)
+  fill_histogram_csr(CSRXMatrix<T> X,
+                     legate::AccessorRO<double, 3> g,
+                     legate::AccessorRO<double, 3> h,
+                     size_t n_outputs,
+                     SparseSplitProposals<T> split_proposals,
+                     NodeBatch batch,
+                     Histogram<IntegerGPair> histogram,
+                     legate::Buffer<IntegerGPair, 2> node_sums,
+                     GradientQuantiser quantiser,
+                     int64_t seed)
+{
+  // Grid stride loop over rows
+  for (std::size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x; idx < batch.InstancesInBatch();
+       idx += static_cast<std::size_t>(blockDim.x * gridDim.x)) {
+    auto [sample_node, local_sample_idx] = batch.instances[idx];
+    // If we don't need to compute this node, skip
+    if (!ComputeHistogramBin(
+          sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)))) {
+      continue;
+    }
+    // Which matrix elements belong to this row?
+    std::size_t const global_sample_idx = X.RowSubset().lo[0] + local_sample_idx;
+    auto elements                       = X.row_ranges[global_sample_idx];
+    for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) {
+      auto feature      = X.column_indices[X.vals_shape.lo[0] + element];
+      auto x            = X.values[X.vals_shape.lo[0] + element];
+      int const bin_idx = split_proposals.FindBin(x, feature);
+      for (int output = 0; output < n_outputs; output++) {
+        legate::Point<3> p = {global_sample_idx, 0, output};
+        auto gpair_quantised =
+          quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+        auto* addPosition = reinterpret_cast<Histogram<IntegerGPair>::atomic_add_type*>(
+          &histogram[{sample_node, output, bin_idx}]);
+        atomicAdd(addPosition, gpair_quantised.grad);
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+        atomicAdd(addPosition + 1, gpair_quantised.hess);
+      }
+    }
+  }
+}
+// NOLINTEND(performance-unnecessary-value-param)
+
 // Manage the launch parameters for histogram kernel
-template <typename T, std::int32_t kBlockThreads = 1024, std::int32_t kItemsPerThread = 4>
+template <typename MatrixT, std::int32_t kBlockThreads = 1024, std::int32_t kItemsPerThread = 4>
 struct HistogramKernel {
+  using T                                 = typename MatrixT::value_type;
   static const std::int32_t kItemsPerTile = kBlockThreads * kItemsPerThread;
   legate::Buffer<int> feature_groups;
   int num_groups{};
@@ -494,10 +536,7 @@ struct HistogramKernel {
     CHECK_CUDA(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
     std::int32_t n_blocks_per_mp = 0;
     CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &n_blocks_per_mp,
-      fill_histogram_shared<T, kBlockThreads, kItemsPerThread>,
-      kBlockThreads,
-      0));
+      &n_blocks_per_mp, fill_histogram_dense<T, kBlockThreads, kItemsPerThread>, kBlockThreads, 0));
     this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps;
     FindFeatureGroups(split_proposals, stream);
   }
@@ -537,8 +576,7 @@ struct HistogramKernel {
                                stream));
   }
 
-  void BuildHistogram(const legate::AccessorRO<T, 3>& X,
-                      int64_t sample_offset,
+  void BuildHistogram(const MatrixT& X,
                       const legate::AccessorRO<double, 3>& g,
                       const legate::AccessorRO<double, 3>& h,
                       size_t n_outputs,
@@ -550,25 +588,35 @@ struct HistogramKernel {
                       int64_t seed,
                       cudaStream_t stream)
   {
-    int const average_features_per_group = split_proposals.num_features / num_groups;
-    std::size_t const average_elements_per_group =
-      batch.InstancesInBatch() * average_features_per_group;
-    auto min_blocks  = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
-    auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
-    // Launch the kernel
-    fill_histogram_shared<T, kBlockThreads, kItemsPerThread>
-      <<<dim3(x_grid_size, num_groups, n_outputs), kBlockThreads, 0, stream>>>(X,
-                                                                               sample_offset,
-                                                                               g,
-                                                                               h,
-                                                                               n_outputs,
-                                                                               split_proposals,
-                                                                               batch,
-                                                                               histogram,
-                                                                               node_sums,
-                                                                               quantiser,
-                                                                               feature_groups,
-                                                                               seed);
+    if constexpr (std::is_same_v<MatrixT, DenseXMatrix<T>>) {
+      int const average_features_per_group = split_proposals.num_features / num_groups;
+      std::size_t const average_elements_per_group =
+        batch.InstancesInBatch() * average_features_per_group;
+      auto min_blocks  = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
+      auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
+      // Launch the kernel
+      fill_histogram_dense<T, kBlockThreads, kItemsPerThread>
+        <<<dim3(x_grid_size, num_groups, n_outputs), kBlockThreads, 0, stream>>>(X,
+                                                                                 g,
+                                                                                 h,
+                                                                                 n_outputs,
+                                                                                 split_proposals,
+                                                                                 batch,
+                                                                                 histogram,
+                                                                                 node_sums,
+                                                                                 quantiser,
+                                                                                 feature_groups,
+                                                                                 seed);
+    } else {
+      // For sparse data we don't currently make use of feature groups or shared memory
+      // Use 1 thread per row for lack of a better option currently
+      // Other methods might involve complicated load balancing
+      auto min_blocks  = (batch.InstancesInBatch() + kItemsPerTile - 1) / kItemsPerTile;
+      auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
+      // Launch the kernel
+      fill_histogram_csr<T, kBlockThreads><<<dim3(x_grid_size), kBlockThreads, 0, stream>>>(
+        X, g, h, n_outputs, split_proposals, batch, histogram, node_sums, quantiser, seed);
+    }
   }
 };
 
@@ -675,11 +723,11 @@ struct GainFeaturePair {
 };
 
 // NOLINTBEGIN(performance-unnecessary-value-param)
-template <typename TYPE, int BLOCK_THREADS>
+template <typename T, int BLOCK_THREADS>
 __global__ void __launch_bounds__(BLOCK_THREADS)
   perform_best_split(Histogram<IntegerGPair> histogram,
                      size_t n_outputs,
-                     SparseSplitProposals<TYPE> split_proposals,
+                     SparseSplitProposals<T> split_proposals,
                      double eps,
                      double alpha,
                      legate::Buffer<double, 2> tree_leaf_value,
@@ -688,7 +736,8 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
                      legate::Buffer<double, 1> tree_split_value,
                      legate::Buffer<double, 1> tree_gain,
                      NodeBatch batch,
-                     GradientQuantiser quantiser)
+                     GradientQuantiser quantiser,
+                     bool is_sparse)
 {
   // using one block per (level) node to have blockwise reductions
   int const node_id = narrow<int>(batch.node_idx_begin + blockIdx.x);
@@ -708,9 +757,12 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
        bin_idx += BLOCK_THREADS) {
     double gain = 0;
     for (int output = 0; output < n_outputs; ++output) {
-      auto node_sum  = vectorised_load(&node_sums[{node_id, output}]);
-      auto left_sum  = vectorised_load(&histogram[{node_id, output, bin_idx}]);
-      auto right_sum = node_sum - left_sum;
+      auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
+
+      auto feature               = split_proposals.FindFeature(bin_idx);
+      auto [left_sum, right_sum] = InferSplitSums(
+        histogram, split_proposals, node_sum, node_id, output, bin_idx, feature, is_sparse);
+
       if (left_sum.hess <= 0 || right_sum.hess <= 0) {
         gain = 0;
         break;
@@ -745,9 +797,17 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
   if (node_best_gain > eps) {
     int const node_best_feature = split_proposals.FindFeature(node_best_bin_idx);
     for (int output = narrow_cast<int>(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) {
-      auto node_sum  = vectorised_load(&node_sums[{node_id, output}]);
-      auto left_sum  = vectorised_load(&histogram[{node_id, output, node_best_bin_idx}]);
-      auto right_sum = node_sum - left_sum;
+      auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
+
+      auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                  split_proposals,
+                                                  node_sum,
+                                                  node_id,
+                                                  output,
+                                                  node_best_bin_idx,
+                                                  node_best_feature,
+                                                  is_sparse);
+
       node_sums[{BinaryTree::LeftChild(node_id), output}]  = left_sum;
       node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum;
 
@@ -809,11 +869,12 @@ struct Tree {
                        [=] __device__(const legate::Point<DIM>& p) { out_acc[p] = x[p]; });
   }
 
-  template <typename ThrustPolicyT>
-  void WriteTreeOutput(legate::TaskContext context,
-                       const ThrustPolicyT& policy,
-                       GradientQuantiser quantiser)
+  void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser)
   {
+    auto* stream      = context.get_task_stream();
+    auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
+    auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
+
     WriteOutput(context.output(0).data(), leaf_value, policy);
     WriteOutput(context.output(1).data(), feature, policy);
     WriteOutput(context.output(2).data(), split_value, policy);
@@ -833,7 +894,7 @@ struct Tree {
 
   ~Tree()                              = default;
   Tree(const Tree&)                    = delete;
-  Tree(Tree&&)                         = delete;
+  Tree(Tree&&)                         = default;
   auto operator=(const Tree&) -> Tree& = delete;
   auto operator=(Tree&&) -> Tree&      = delete;
 
@@ -842,8 +903,8 @@ struct Tree {
   legate::Buffer<double, 1> split_value;
   legate::Buffer<double, 1> gain;
   legate::Buffer<IntegerGPair, 2> node_sums;
-  const int num_outputs;
-  const int max_nodes;
+  int num_outputs;
+  int max_nodes;
   cudaStream_t stream;
 };
 
@@ -851,10 +912,9 @@ struct Tree {
 // Use nccl to share the samples with all workers
 // Remove any duplicates
 // Return sparse matrix of split samples for each feature
-template <typename T>
+template <typename T, template <typename> class XMatrix>
 auto SelectSplitSamples(legate::TaskContext context,
-                        const legate::AccessorRO<T, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const XMatrix<T>& X,
                         int split_samples,
                         int seed,
                         int64_t dataset_rows,
@@ -862,7 +922,6 @@ auto SelectSplitSamples(legate::TaskContext context,
 {
   auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
   auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
-  auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
   // Randomly choose split_samples rows
   auto row_samples = legate::create_buffer<int64_t, 1>(split_samples);
   auto counting    = thrust::make_counting_iterator(0);
@@ -873,54 +932,57 @@ auto SelectSplitSamples(legate::TaskContext context,
       eng.discard(idx);
       return dist(eng);
     });
-  auto draft_proposals = legate::create_buffer<T, 2>({num_features, split_samples});
+  auto draft_proposals = legate::create_buffer<T, 2>({X.NumFeatures(), split_samples});
 
   // fill with local data
-  LaunchN(num_features * split_samples, stream, [=] __device__(auto idx) {
-    auto i                  = idx / num_features;
-    auto j                  = idx % num_features;
+  LaunchN(X.NumFeatures() * split_samples, stream, [=] __device__(auto idx) {
+    auto i                  = idx / X.NumFeatures();
+    auto j                  = idx % X.NumFeatures();
     auto row                = row_samples[i];
-    bool const has_data     = row >= X_shape.lo[0] && row <= X_shape.hi[0];
-    draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0);
+    bool const has_data     = X.RowSubset().contains(row);
+    draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
   });
 
   // Sum reduce over all workers
   SumAllReduce(
-    context, tcb::span<T>(draft_proposals.ptr({0, 0}), num_features * split_samples), stream);
+    context, tcb::span<T>(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples), stream);
 
   CHECK_CUDA_STREAM(stream);
 
   // Condense split samples to unique values
   // First sort the samples
-  auto keys = legate::create_buffer<int32_t, 1>(num_features * split_samples);
-  thrust::transform(
-    policy, counting, counting + num_features * split_samples, keys.ptr(0), [=] __device__(int i) {
-      return i / split_samples;
-    });
+  auto keys = legate::create_buffer<int32_t, 1>(X.NumFeatures() * split_samples);
+  thrust::transform(policy,
+                    counting,
+                    counting + X.NumFeatures() * split_samples,
+                    keys.ptr(0),
+                    [=] __device__(int i) { return i / split_samples; });
 
   // Segmented sort
   auto begin =
     thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0})));
-  thrust::sort(policy, begin, begin + num_features * split_samples, [] __device__(auto a, auto b) {
-    if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); }
-    return thrust::get<1>(a) < thrust::get<1>(b);
-  });
+  thrust::sort(
+    policy, begin, begin + X.NumFeatures() * split_samples, [] __device__(auto a, auto b) {
+      if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); }
+      return thrust::get<1>(a) < thrust::get<1>(b);
+    });
 
   // Extract the unique values
-  auto out_keys        = legate::create_buffer<int32_t, 1>(num_features * split_samples);
-  auto split_proposals = legate::create_buffer<T, 1>(num_features * split_samples);
+  auto out_keys        = legate::create_buffer<int32_t, 1>(X.NumFeatures() * split_samples);
+  auto split_proposals = legate::create_buffer<T, 1>(X.NumFeatures() * split_samples);
   auto key_val =
     thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0})));
   auto out_iter =
     thrust::make_zip_iterator(thrust::make_tuple(out_keys.ptr(0), split_proposals.ptr(0)));
   auto result =
-    thrust::unique_copy(policy, key_val, key_val + num_features * split_samples, out_iter);
+    thrust::unique_copy(policy, key_val, key_val + X.NumFeatures() * split_samples, out_iter);
   auto n_unique = thrust::distance(out_iter, result);
   // Count the unique values for each feature
-  auto row_pointers = legate::create_buffer<int32_t, 1>(num_features + 1);
-  CHECK_CUDA(cudaMemsetAsync(row_pointers.ptr(0), 0, (num_features + 1) * sizeof(int32_t), stream));
+  auto row_pointers = legate::create_buffer<int32_t, 1>(X.NumFeatures() + 1);
+  CHECK_CUDA(
+    cudaMemsetAsync(row_pointers.ptr(0), 0, (X.NumFeatures() + 1) * sizeof(int32_t), stream));
 
-  tcb::span<int32_t> const out_keys_span(out_keys.ptr(0), num_features * split_samples);
+  tcb::span<int32_t> const out_keys_span(out_keys.ptr(0), X.NumFeatures() * split_samples);
   auto unique_keys_span = out_keys_span.subspan(0, n_unique);
   thrust::reduce_by_key(policy,
                         unique_keys_span.begin(),
@@ -929,15 +991,15 @@ auto SelectSplitSamples(legate::TaskContext context,
                         thrust::make_discard_iterator(),
                         row_pointers.ptr(1));
   // Scan the counts to get the row pointers for a CSR matrix
-  tcb::span<int32_t> const row_pointers_span(row_pointers.ptr(0), num_features + 1);
+  tcb::span<int32_t> const row_pointers_span(row_pointers.ptr(0), X.NumFeatures() + 1);
   thrust::inclusive_scan(policy,
                          row_pointers_span.begin() + 1,
-                         row_pointers_span.begin() + 1 + num_features,
+                         row_pointers_span.begin() + 1 + X.NumFeatures(),
                          row_pointers_span.begin() + 1);
 
   // Set the largest split sample to +inf such that an element must belong to one of the bins
   // i.e. we cannot go off the end when searching for a bin
-  LaunchN(num_features, stream, [=] __device__(int i) {
+  LaunchN(X.NumFeatures(), stream, [=] __device__(int i) {
     auto end                 = row_pointers_span[i + 1];
     split_proposals[end - 1] = std::numeric_limits<T>::infinity();
   });
@@ -946,7 +1008,7 @@ auto SelectSplitSamples(legate::TaskContext context,
   row_samples.destroy();
   draft_proposals.destroy();
   out_keys.destroy();
-  return SparseSplitProposals<T>(split_proposals, row_pointers, num_features, n_unique);
+  return SparseSplitProposals<T>(split_proposals, row_pointers, X.NumFeatures(), n_unique);
 }
 
 // Can't put a device lambda in constructor so make this a function
@@ -961,8 +1023,9 @@ void FillPositions(const legate::Buffer<cuda::std::tuple<int32_t, int32_t>>& sor
 
 }  // namespace
 
-template <typename T>
+template <typename MatrixT>
 struct TreeBuilder {
+  using T = typename MatrixT::value_type;
   TreeBuilder(int32_t num_rows,
               int32_t num_features,
               int32_t num_outputs,
@@ -970,15 +1033,18 @@ struct TreeBuilder {
               int32_t max_nodes,
               int32_t max_depth,
               const SparseSplitProposals<T>& split_proposals,
-              GradientQuantiser quantiser)
+              GradientQuantiser quantiser,
+              int64_t seed)
     : num_rows(num_rows),
       num_features(num_features),
       num_outputs(num_outputs),
       stream(stream),
       max_nodes(max_nodes),
+      max_depth(max_depth),
       split_proposals(split_proposals),
       quantiser(quantiser),
-      histogram_kernel(split_proposals, stream)
+      histogram_kernel(split_proposals, stream),
+      seed(seed)
   {
     sorted_positions = legate::create_buffer<cuda::std::tuple<int32_t, int32_t>>(num_rows);
     FillPositions(sorted_positions, num_rows, stream);
@@ -1002,14 +1068,45 @@ struct TreeBuilder {
     max_batch_size = max_histogram_nodes;
   }
 
+  auto Build(legate::TaskContext context,
+             const MatrixT& X_matrix,
+             const legate::AccessorRO<double, 3>& g_accessor,
+             const legate::AccessorRO<double, 3>& h_accessor,
+             legate::Rect<3> g_shape,
+             double alpha) -> Tree
+  {
+    auto* stream            = context.get_task_stream();
+    auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
+    auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
+
+    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
+
+    this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
+
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = this->PrepareBatches(depth);
+      for (auto batch : batches) {
+        auto histogram = this->GetHistogram(batch);
+
+        this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
+
+        this->PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); }
+    }
+
+    return tree;
+  }
+
   TreeBuilder(const TreeBuilder&)                    = delete;
   TreeBuilder(TreeBuilder&&)                         = delete;
   auto operator=(const TreeBuilder&) -> TreeBuilder& = delete;
   auto operator=(TreeBuilder&&) -> TreeBuilder&      = delete;
   ~TreeBuilder()                                     = default;
 
-  template <typename TYPE>
-  void UpdatePositions(Tree& tree, const legate::AccessorRO<TYPE, 3>& X, legate::Rect<3> X_shape)
+  void UpdatePositions(Tree& tree, const MatrixT& X)
   {
     tcb::span<int32_t> const tree_feature_span(tree.feature.ptr(0), max_nodes);
     tcb::span<double> const tree_split_value_span(tree.split_value.ptr(0), max_nodes);
@@ -1024,10 +1121,13 @@ struct TreeBuilder {
                 sorted_positions[idx] = cuda::std::make_tuple(-1, row);
                 return;
               }
+
               double const x_value =
-                X[{X_shape.lo[0] + static_cast<int64_t>(row), tree_feature_span[pos], 0}];
+                X.Get(X.RowSubset().lo[0] + static_cast<int64_t>(row), tree_feature_span[pos]);
               bool const left = x_value <= tree_split_value_span[pos];
               pos             = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
+              // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos],
+              // x_value, pos);
               sorted_positions[idx] = cuda::std::make_tuple(pos, row);
             });
     CHECK_CUDA_STREAM(stream);
@@ -1064,19 +1164,15 @@ struct TreeBuilder {
                                stream));
   }
 
-  template <typename TYPE>
   void ComputeHistogram(Histogram<IntegerGPair> histogram,
                         legate::TaskContext context,
                         Tree& tree,
-                        const legate::AccessorRO<TYPE, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const MatrixT& X,
                         const legate::AccessorRO<double, 3>& g,
                         const legate::AccessorRO<double, 3>& h,
-                        NodeBatch batch,
-                        int64_t seed)
+                        NodeBatch batch)
   {
     histogram_kernel.BuildHistogram(X,
-                                    X_shape.lo[0],
                                     g,
                                     h,
                                     num_outputs,
@@ -1127,7 +1223,8 @@ struct TreeBuilder {
                                                            tree.split_value,
                                                            tree.gain,
                                                            batch,
-                                                           quantiser);
+                                                           quantiser,
+                                                           std::is_same_v<MatrixT, CSRXMatrix<T>>);
     CHECK_CUDA_STREAM(stream);
   }
   void InitialiseRoot(legate::TaskContext context,
@@ -1135,8 +1232,7 @@ struct TreeBuilder {
                       const legate::AccessorRO<double, 3>& g,
                       const legate::AccessorRO<double, 3>& h,
                       legate::Rect<3> g_shape,
-                      double alpha,
-                      int64_t seed)
+                      double alpha)
   {
     const int kBlockThreads = 256;
     const size_t blocks     = (num_rows + kBlockThreads - 1) / kBlockThreads;
@@ -1243,11 +1339,13 @@ struct TreeBuilder {
   const int32_t num_features;
   const int32_t num_outputs;
   const int32_t max_nodes;
+  const int32_t max_depth;
+  const int64_t seed;
   SparseSplitProposals<T> split_proposals;
   Histogram<IntegerGPair> histogram;
   int max_batch_size;
   GradientQuantiser quantiser;
-  HistogramKernel<T> histogram_kernel;
+  HistogramKernel<MatrixT> histogram_kernel;
 
   cudaStream_t stream;
 };
@@ -1260,6 +1358,8 @@ struct build_tree_fn {
     auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(1).data());
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(2).data());
 
+    DenseXMatrix const X_matrix(X_accessor, X_shape);
+
     EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape);
     auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
     auto num_rows     = std::max<int64_t>(X_shape.hi[0] - X_shape.lo[0] + 1, 0);
@@ -1281,51 +1381,96 @@ struct build_tree_fn {
     auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
     auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
 
-    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
-
-    SparseSplitProposals<T> const split_proposals =
-      SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows, stream);
+    const SparseSplitProposals<T> split_proposals =
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
     GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
-    // Begin building the tree
-    TreeBuilder<T> builder(num_rows,
-                           num_features,
-                           num_outputs,
-                           stream,
-                           tree.max_nodes,
-                           max_depth,
-                           split_proposals,
-                           quantiser);
+    auto tree = TreeBuilder<DenseXMatrix<T>>(num_rows,
+                                             num_features,
+                                             num_outputs,
+                                             stream,
+                                             max_nodes,
+                                             max_depth,
+                                             split_proposals,
+                                             quantiser,
+                                             seed)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed);
+    tree.WriteTreeOutput(context, quantiser);
 
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
+    CHECK_CUDA(cudaStreamSynchronize(stream));
+    CHECK_CUDA_STREAM(stream);
+  }
+};
+
+struct build_tree_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1>, 1>(context.input(2).data());
+    auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(3).data());
+    auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(4).data());
+
+    auto num_rows    = std::max<int64_t>(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
+    auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1;
+    EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers.");
 
-        builder.ComputeHistogram(
-          histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch, seed);
+    // Scalars
+    auto max_depth     = context.scalars().at(0).value<int>();
+    auto max_nodes     = context.scalars().at(1).value<int>();
+    auto alpha         = context.scalars().at(2).value<double>();
+    auto split_samples = context.scalars().at(3).value<int>();
+    auto seed          = context.scalars().at(4).value<int>();
+    auto dataset_rows  = context.scalars().at(5).value<int64_t>();
+    auto num_features  = context.scalars().at(6).value<int64_t>();
+
+    auto* stream = context.get_task_stream();
+    CSRXMatrix<T> const X_matrix(X_vals_accessor,
+                                 X_coords_accessor,
+                                 X_offsets_accessor,
+                                 X_vals_shape,
+                                 X_offsets_shape,
+                                 num_features,
+                                 X_vals_shape.volume());
+    const SparseSplitProposals<T> split_proposals =
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); }
-    }
+    GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
-    tree.WriteTreeOutput(context, thrust_exec_policy, quantiser);
+    // Begin building the tree
+    auto tree = TreeBuilder<CSRXMatrix<T>>(num_rows,
+                                           num_features,
+                                           num_outputs,
+                                           stream,
+                                           max_nodes,
+                                           max_depth,
+                                           split_proposals,
+                                           quantiser,
+                                           seed)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
+
+    tree.WriteTreeOutput(context, quantiser);
 
     CHECK_CUDA(cudaStreamSynchronize(stream));
     CHECK_CUDA_STREAM(stream);
   }
 };
 
-/*static*/ void BuildTreeTask::gpu_variant(legate::TaskContext context)
+/*static*/ void BuildTreeDenseTask::gpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
   type_dispatch_float(X.code(), build_tree_fn(), context);
 }
 
+/*static*/ void BuildTreeCSRTask::gpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  type_dispatch_float(X.code(), build_tree_csr_fn(), context);
+}
+
 }  // namespace legateboost
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index af8aa9d4..442b02cb 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -41,6 +41,12 @@ struct GPairBase {
     this->hess += b.hess;
     return *this;
   }
+  __host__ __device__ auto operator-=(const GPairBase<T>& b) -> GPairBase<T>&
+  {
+    this->grad -= b.grad;
+    this->hess -= b.hess;
+    return *this;
+  }
 };
 
 template <typename T>
@@ -243,9 +249,57 @@ class Histogram {
   {
     return buffer_[{p[0] - node_begin_, p[1], p[2]}];
   }
+
+  // Node, output, bin
+  __host__ __device__ auto operator[](legate::Point<3> p) const -> GPairT
+  {
+    return buffer_[{p[0] - node_begin_, p[1], p[2]}];
+  }
+};
+
+// From the scanned histogram gradients and the node sums, infer the gradients for the left and
+// right partitions For a dense matrix the left sum is the scanned histogram and the right sum is
+// the node sum minus the left sum In the case where we have a sparse matrix, gradients for 0's have
+// not been accumulated in the histogram. We can infer the gradients at the matrix zeroes by
+// subtracting the sum of the gradients at the last bin (which contains gradients from every
+// non-zero element) from the sum of the gradients in the node (this sum always includes gradients
+// for every element in that node)
+template <typename T, typename GPairT>
+__host__ __device__ auto InferSplitSums(const Histogram<GPairT>& scanned_histogram,
+                                        const SparseSplitProposals<T>& split_proposals,
+                                        const GPairT& node_sum,
+                                        int node_id,
+                                        int output,
+                                        int bin_idx,
+                                        int feature,
+                                        bool is_sparse) -> std::tuple<GPairT, GPairT>
+{
+  auto left_sum  = scanned_histogram[{node_id, output, bin_idx}];
+  auto right_sum = node_sum - left_sum;
+  if (!is_sparse) { return std::make_tuple(left_sum, right_sum); }
+  auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature);
+  auto scan_sum                     = scanned_histogram[{node_id, output, feature_end - 1}];
+  auto zero_bin                     = split_proposals.FindBin(0.0, feature);
+  auto sparse_sum                   = node_sum - scan_sum;
+  if (bin_idx < zero_bin) {
+    // Do nothing, this amount is already on the right
+  } else {
+    // Move it to the left
+    left_sum += sparse_sum;
+    right_sum -= sparse_sum;
+  }
+  return std::make_tuple(left_sum, right_sum);
+}
+
+class BuildTreeDenseTask : public Task<BuildTreeDenseTask, BUILD_TREE> {
+ public:
+  static void cpu_variant(legate::TaskContext context);
+#ifdef LEGATEBOOST_USE_CUDA
+  static void gpu_variant(legate::TaskContext context);
+#endif
 };
 
-class BuildTreeTask : public Task<BuildTreeTask, BUILD_TREE> {
+class BuildTreeCSRTask : public Task<BuildTreeCSRTask, BUILD_TREE_CSR> {
  public:
   static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
   static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h
new file mode 100644
index 00000000..32eb62ab
--- /dev/null
+++ b/src/models/tree/matrix_types.h
@@ -0,0 +1,124 @@
+/* Copyright 2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#pragma once
+#include <legate.h>
+#include <cstdint>
+#include <utility>
+#ifdef __CUDACC__
+#include <thrust/binary_search.h>
+#else
+#include <algorithm>
+#endif
+
+namespace legateboost {
+// Create a uniform interface to two matrix formats
+// Dense and CSR
+template <typename T>
+class DenseXMatrix {
+ public:
+  using value_type = T;
+
+ private:
+  legate::AccessorRO<T, 3> x;
+  legate::Rect<3> shape;
+
+ public:
+  DenseXMatrix(const legate::AccessorRO<T, 3>& x, legate::Rect<3> shape)
+    : x(std::move(x)), shape(shape)
+  {
+  }
+  // Global row index refers to the index across partitions
+  // For features, each worker has every feature so the global is the same as the local index
+  [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const
+    -> T
+  {
+    return x[legate::Point<3>{global_row_idx, feature_idx, 0}];
+  }
+  [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int
+  {
+    return shape.hi[1] - shape.lo[1] + 1;
+  }
+  [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t>
+  {
+    return {shape.lo[0], shape.hi[0]};
+  }
+};
+
+template <typename T>
+class CSRXMatrix {
+ public:
+  using value_type = T;
+
+  legate::AccessorRO<T, 1> values;
+  legate::Rect<1, legate::coord_t> vals_shape;
+  legate::AccessorRO<int64_t, 1> column_indices;
+  legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges;
+  legate::Rect<1, legate::coord_t> row_ranges_shape;
+  int num_features;
+  std::size_t nnz;  // The number of nnz in ths local partition
+
+  CSRXMatrix(const legate::AccessorRO<T, 1>& values,
+             const legate::AccessorRO<int64_t, 1>& column_indices,
+             const legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1>& row_ranges,
+             legate::Rect<1, legate::coord_t> vals_shape,
+             legate::Rect<1, legate::coord_t> row_ranges_shape,
+             int num_features,
+             std::size_t nnz)
+    : values(std::move(values)),
+      column_indices(column_indices),
+      row_ranges(row_ranges),
+      num_features(num_features),
+      vals_shape(vals_shape),
+      row_ranges_shape(row_ranges_shape),
+      nnz(nnz)
+  {
+  }
+
+  // Global row index refers to the index across partitions
+  // For features, each worker has every feature so the global is the same as the local index
+  // This method is less efficient than its Dense counterpart due to the need to search for the
+  // feature
+  [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const
+    -> T
+  {
+    auto row_range = row_ranges[narrow<int64_t>(global_row_idx)];
+
+    tcb::span<const int64_t> const column_indices_span(column_indices.ptr(row_range.lo),
+                                                       row_range.volume());
+
+#ifdef __CUDACC__
+    const auto* result = thrust::lower_bound(
+      thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx);
+#else
+    const auto* result =
+      std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx);
+#endif
+
+    if (result != column_indices_span.end() && *result == feature_idx) {
+      return values[row_range.lo + (result - column_indices_span.begin())];
+    }
+    return 0;
+  }
+
+  [[nodiscard]] auto NNZ() const { return nnz; }
+
+  [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int { return num_features; }
+  [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t>
+  {
+    return row_ranges_shape;
+  }
+};
+}  // namespace legateboost
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index a87d5044..13e36f0c 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -16,11 +16,34 @@
 #include "predict.h"
 #include <cstdint>
 #include "../../cpp_utils/cpp_utils.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
 namespace {
-struct predict_fn {
+template <typename MatrixT>
+void PredictRows(const MatrixT& X,
+                 const legate::AccessorWO<double, 3>& pred_accessor,
+                 legate::Rect<3, legate::coord_t> pred_shape,
+                 const legate::AccessorRO<double, 1>& split_value,
+                 const legate::AccessorRO<int32_t, 1>& feature,
+                 const legate::AccessorRO<double, 2>& leaf_value)
+{
+  for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) {
+    int pos = 0;
+    // Use a max depth of 100 to avoid infinite loops
+    const int max_depth = 100;
+    for (int depth = 0; depth < max_depth; depth++) {
+      if (feature[pos] == -1) { break; }
+      auto x = X.Get(i, feature[pos]);
+      pos    = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
+    }
+    for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
+      pred_accessor[{i, 0, j}] = leaf_value[{pos, j}];
+    }
+  }
+}
+struct predict_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
@@ -45,27 +68,59 @@ struct predict_fn {
     EXPECT_IS_BROADCAST(context.input(2).data().shape<1>());
     EXPECT_IS_BROADCAST(context.input(3).data().shape<1>());
 
-    for (int64_t i = X_shape.lo[0]; i <= X_shape.hi[0]; i++) {
-      int pos = 0;
-      // Use a max depth of 100 to avoid infinite loops
-      const int max_depth = 100;
-      for (int depth = 0; depth < max_depth; depth++) {
-        if (feature[pos] == -1) { break; }
-        auto x = X_accessor[{i, feature[pos], 0}];
-        pos    = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
-      }
-      for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
-        pred_accessor[{i, 0, j}] = leaf_value[{pos, j}];
-      }
-    }
+    PredictRows(DenseXMatrix<T>(X_accessor, X_shape),
+                pred_accessor,
+                pred_shape,
+                split_value,
+                feature,
+                leaf_value);
+  }
+};
+
+struct predict_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1, legate::coord_t>, 1>(context.input(2).data());
+
+    auto leaf_value  = context.input(3).data().read_accessor<double, 2>();
+    auto feature     = context.input(4).data().read_accessor<int32_t, 1>();
+    auto split_value = context.input(5).data().read_accessor<double, 1>();
+
+    auto pred          = context.output(0).data();
+    auto pred_shape    = pred.shape<3>();
+    auto pred_accessor = pred.write_accessor<double, 3>();
+
+    auto num_features = context.scalars().at(0).value<int32_t>();
+    CSRXMatrix<T> const X(X_vals_accessor,
+                          X_coords_accessor,
+                          X_offsets_accessor,
+                          X_vals_shape,
+                          X_offsets_shape,
+                          num_features,
+                          X_vals_shape.volume());
+
+    EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
+
+    PredictRows(X, pred_accessor, pred_shape, split_value, feature, leaf_value);
   }
 };
 }  // namespace
 
-/*static*/ void PredictTask::cpu_variant(legate::TaskContext context)
+/*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  type_dispatch_float(X.code(), predict_dense_fn(), context);
+}
+
+/*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
-  type_dispatch_float(X.code(), predict_fn(), context);
+  type_dispatch_float(X.code(), predict_csr_fn(), context);
 }
 
 }  // namespace legateboost
@@ -74,6 +129,7 @@ namespace  // unnamed
 {
 void __attribute__((constructor)) register_tasks()
 {
-  legateboost::PredictTask::register_variants();
+  legateboost::PredictTreeTask::register_variants();
+  legateboost::PredictTreeCSRTask::register_variants();
 }
 }  // namespace
diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu
index 65837619..7c93b0f6 100644
--- a/src/models/tree/predict.cu
+++ b/src/models/tree/predict.cu
@@ -19,11 +19,42 @@
 #include "../../cpp_utils/cpp_utils.cuh"
 #include "../../cpp_utils/cpp_utils.h"
 #include "predict.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
 namespace {
-struct predict_fn {
+
+template <typename MatrixT>
+void PredictRows(const MatrixT& X,
+                 const legate::AccessorWO<double, 3>& pred_accessor,
+                 legate::Rect<3, legate::coord_t> pred_shape,
+                 const legate::AccessorRO<double, 1>& split_value,
+                 const legate::AccessorRO<int32_t, 1>& feature,
+                 const legate::AccessorRO<double, 2>& leaf_value,
+                 cudaStream_t stream)
+{
+  // rowwise kernel
+  auto prediction_lambda = [=] __device__(size_t idx) {
+    int64_t pos         = 0;
+    auto global_row_idx = X.RowSubset().lo + idx;
+    // Use a max depth of 100 to avoid infinite loops
+    const int max_depth = 100;
+    for (int depth = 0; depth < max_depth; depth++) {
+      if (feature[pos] == -1) { break; }
+      double const X_val = X.Get(global_row_idx, feature[pos]);
+      pos                = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
+    }
+    for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
+      pred_accessor[{global_row_idx, 0, j}] = leaf_value[{pos, j}];
+    }
+  };  // NOLINT(readability/braces)
+
+  LaunchN(X.RowSubset().volume(), stream, prediction_lambda);
+  CHECK_CUDA_STREAM(stream);
+}
+
+struct predict_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
@@ -39,7 +70,6 @@ struct predict_fn {
     auto pred          = context.output(0).data();
     auto pred_shape    = pred.shape<3>();
     auto pred_accessor = pred.write_accessor<double, 3>();
-    auto n_outputs     = pred_shape.hi[2] - pred_shape.lo[2] + 1;
 
     EXPECT(pred_shape.lo[2] == 0, "Expect all outputs to be present");
     // We should have one output prediction per row of X
@@ -50,36 +80,61 @@ struct predict_fn {
     EXPECT_IS_BROADCAST(context.input(2).data().shape<1>());
     EXPECT_IS_BROADCAST(context.input(3).data().shape<1>());
 
-    // rowwise kernel
-    auto prediction_lambda = [=] __device__(size_t idx) {
-      int64_t pos              = 0;
-      legate::Point<3> x_point = {X_shape.lo[0] + static_cast<int64_t>(idx), 0, 0};
-
-      // Use a max depth of 100 to avoid infinite loops
-      const int max_depth = 100;
-      for (int depth = 0; depth < max_depth; depth++) {
-        if (feature[pos] == -1) { break; }
-        x_point[1]         = feature[pos];
-        double const X_val = X_accessor[x_point];
-        pos                = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
-      }
-      for (int64_t j = 0; j < n_outputs; j++) {
-        pred_accessor[{X_shape.lo[0] + static_cast<int64_t>(idx), 0, j}] = leaf_value[{pos, j}];
-      }
-    };  // NOLINT(readability/braces)
-
-    auto* stream = context.get_task_stream();
-    LaunchN(X_shape.hi[0] - X_shape.lo[0] + 1, stream, prediction_lambda);
-
-    CHECK_CUDA_STREAM(stream);
+    PredictRows(DenseXMatrix<T>(X_accessor, X_shape),
+                pred_accessor,
+                pred_shape,
+                split_value,
+                feature,
+                leaf_value,
+                context.get_task_stream());
+  }
+};
+
+struct predict_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1, legate::coord_t>, 1>(context.input(2).data());
+
+    auto leaf_value  = context.input(3).data().read_accessor<double, 2>();
+    auto feature     = context.input(4).data().read_accessor<int32_t, 1>();
+    auto split_value = context.input(5).data().read_accessor<double, 1>();
+
+    auto pred          = context.output(0).data();
+    auto pred_shape    = pred.shape<3>();
+    auto pred_accessor = pred.write_accessor<double, 3>();
+
+    auto num_features = context.scalars().at(0).value<int32_t>();
+    CSRXMatrix<T> const X(X_vals_accessor,
+                          X_coords_accessor,
+                          X_offsets_accessor,
+                          X_vals_shape,
+                          X_offsets_shape,
+                          num_features,
+                          X_vals_shape.volume());
+
+    EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
+
+    PredictRows(
+      X, pred_accessor, pred_shape, split_value, feature, leaf_value, context.get_task_stream());
   }
 };
 }  // namespace
 
-/*static*/ void PredictTask::gpu_variant(legate::TaskContext context)
+/*static*/ void PredictTreeTask::gpu_variant(legate::TaskContext context)
+{
+  auto X = context.input(0).data();
+  type_dispatch_float(X.code(), predict_dense_fn(), context);
+}
+
+/*static*/ void PredictTreeCSRTask::gpu_variant(legate::TaskContext context)
 {
   auto X = context.input(0).data();
-  type_dispatch_float(X.code(), predict_fn(), context);
+  type_dispatch_float(X.code(), predict_csr_fn(), context);
 }
 
 }  // namespace legateboost
diff --git a/src/models/tree/predict.h b/src/models/tree/predict.h
index 1d98e6b5..c55efd42 100644
--- a/src/models/tree/predict.h
+++ b/src/models/tree/predict.h
@@ -20,7 +20,15 @@
 
 namespace legateboost {
 
-class PredictTask : public Task<PredictTask, PREDICT> {
+class PredictTreeTask : public Task<PredictTreeTask, PREDICT_TREE> {
+ public:
+  static void cpu_variant(legate::TaskContext context);
+#ifdef LEGATEBOOST_USE_CUDA
+  static void gpu_variant(legate::TaskContext context);
+#endif
+};
+
+class PredictTreeCSRTask : public Task<PredictTreeCSRTask, PREDICT_TREE_CSR> {
  public:
   static void cpu_variant(legate::TaskContext context);
 #ifdef LEGATEBOOST_USE_CUDA