diff --git a/ci/run_pytests_cpu.sh b/ci/run_pytests_cpu.sh index 50e138ac..16a74640 100755 --- a/ci/run_pytests_cpu.sh +++ b/ci/run_pytests_cpu.sh @@ -18,6 +18,7 @@ set -e -E -u -o pipefail cd legateboost/test legate \ + --gpus 0 \ --sysmem 28000 \ --module pytest \ . \ diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index f0b58cd7..194fd852 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -15,6 +15,7 @@ dependencies: - cuda-version>=12.2 - cupynumeric==25.01.*,>=0.0.0.dev0 - hypothesis>=6 +- legate-sparse - legate==25.01.*,>=0.0.0.dev0 - libcublas-dev - llvm-openmp diff --git a/dependencies.yaml b/dependencies.yaml index 1bb50135..94590d24 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -173,3 +173,4 @@ dependencies: - pytest>=7,<8 - seaborn>=0.13 - xgboost>=2.0 + - legate-sparse diff --git a/examples/sparse/README.md b/examples/sparse/README.md new file mode 100644 index 00000000..c0cd3efc --- /dev/null +++ b/examples/sparse/README.md @@ -0,0 +1,3 @@ +# Sparse data + +This example trains a youtube comment spam classifier on a sparse dataset. The comments as raw strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn. diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py new file mode 100644 index 00000000..a189beff --- /dev/null +++ b/examples/sparse/sparse.py @@ -0,0 +1,47 @@ +import pandas as pd +from sklearn.datasets import fetch_openml +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.model_selection import train_test_split + +import legateboost as lb + +# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset]. +# UCI Machine Learning Repository. https://doi.org/10.24432/C58885. +dataset_names = [ + "youtube-spam-psy", + "youtube-spam-shakira", + "youtube-spam-lmfao", + "youtube-spam-eminem", + "youtube-spam-katyperry", +] +X = [] +for dataset_name in dataset_names: + dataset = fetch_openml(name=dataset_name, as_frame=True) + X.append(dataset.data) + +X = pd.concat(X) +y = X["CLASS"] +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 +) +vectorizer = CountVectorizer() +X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"]) +X_test_vectorized = vectorizer.transform(X_test["CONTENT"]) + +model = lb.LBClassifier().fit( + X_train_vectorized, y_train, eval_set=[(X_test_vectorized, y_test)] +) + + +def evaluate_comment(comment): + print("Comment: {}".format(comment)) + print( + "Probability of spam: {}".format( + model.predict_proba(vectorizer.transform([comment]))[0, 1] + ) + ) + + +evaluate_comment(X_test.iloc[15]["CONTENT"]) +evaluate_comment(X_test.iloc[3]["CONTENT"]) +evaluate_comment("Your text here") diff --git a/legateboost/input_validation.py b/legateboost/input_validation.py index 38990a34..77c384f0 100644 --- a/legateboost/input_validation.py +++ b/legateboost/input_validation.py @@ -3,6 +3,11 @@ import numpy as np import scipy.sparse as sp +try: + from legate_sparse import csr_matrix +except ImportError: + csr_matrix = None + import cupynumeric as cn __all__: List[str] = [] @@ -29,22 +34,22 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray: def check_array(x: Any) -> cn.ndarray: if sp.issparse(x): - raise ValueError("Sparse matrix not allowed.") - - if not hasattr(x, "__legate_data_interface__"): - x = cn.array(np.require(x, requirements=["C", "A"])) - if hasattr(x, "__array_interface__"): - shape = x.__array_interface__["shape"] - if shape[0] <= 0: - raise ValueError( - "Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required." % (shape[0], shape, 1) - ) - if len(shape) >= 2 and 0 in shape: - raise ValueError( - "Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required." % (shape[1], shape, 1) - ) + x = csr_matrix(x) + elif isinstance(x, csr_matrix): + pass + else: + x = cn.array(x, copy=False) + + if x.shape[0] <= 0: + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required." % (x.shape[0], x.shape, 1) + ) + if len(x.shape) >= 2 and 0 in x.shape: + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required." % (x.shape[1], x.shape, 1) + ) if cn.iscomplexobj(x): raise ValueError("Complex data not supported.") @@ -52,8 +57,6 @@ def check_array(x: Any) -> cn.ndarray: if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()): raise ValueError("Input contains NaN or inf") - x = cn.array(x, copy=False) - return x diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index 07408559..1b23cdcb 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: """ pass + def supports_csr(self) -> bool: + """Whether the model supports CSR matrix input. + + Returns + ------- + bool + True if the model supports CSR matrix input, False otherwise. + """ + return False + @abstractmethod def __str__(self) -> str: pass diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index a3aa8c37..ff4079d3 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -1,9 +1,20 @@ import copy from enum import IntEnum -from typing import Any +from typing import Any, Union import cupynumeric as cn -from legate.core import TaskTarget, get_legate_runtime, types +from legate.core import ( + ImageComputationHint, + TaskTarget, + get_legate_runtime, + image, + types, +) + +try: + from legate_sparse import csr_matrix +except ImportError: + csr_matrix = None from ..library import user_context, user_lib from ..utils import get_store @@ -12,7 +23,9 @@ class LegateBoostOpCode(IntEnum): BUILD_TREE = user_lib.cffi.BUILD_TREE - PREDICT = user_lib.cffi.PREDICT + BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR + PREDICT_TREE = user_lib.cffi.PREDICT_TREE + PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR UPDATE_TREE = user_lib.cffi.UPDATE_TREE @@ -54,12 +67,7 @@ def __init__( self.split_samples = split_samples self.alpha = alpha - def fit( - self, - X: cn.ndarray, - g: cn.ndarray, - h: cn.ndarray, - ) -> "Tree": + def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree": num_outputs = g.shape[1] task = get_legate_runtime().create_auto_task( @@ -120,6 +128,90 @@ def fit( self.hessian = cn.array(hessian, copy=False) return self + def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree": + num_outputs = g.shape[1] + + task = get_legate_runtime().create_auto_task( + user_context, LegateBoostOpCode.BUILD_TREE_CSR + ) + + # promote these to 3d. When the g/h shapes match those of the dense version, + # it makes code reuse easier on the C++ side + g_ = get_store(g).promote(1, 1) + h_ = get_store(h).promote(1, 1) + + task.add_scalar_arg(self.max_depth, types.int32) + max_nodes = 2 ** (self.max_depth + 1) + task.add_scalar_arg(max_nodes, types.int32) + task.add_scalar_arg(self.alpha, types.float64) + task.add_scalar_arg(self.split_samples, types.int32) + task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32) + task.add_scalar_arg(X.shape[0], types.int64) + task.add_scalar_arg(X.shape[1], types.int64) + + # inputs + val_var = task.add_input(X.vals) + crd_var = task.add_input(X.crd) + pos_var = task.add_input(X.pos) + task.add_input(g_) + task.add_input(h_) + pos_promoted = X.pos.promote(1, g.shape[1]).promote(1, 1) + # we don't need this input but use it for alignment + task.add_input(pos_promoted) + + task.add_alignment(g_, h_) + task.add_alignment(g_, pos_promoted) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST) + ) + + # outputs + leaf_value = get_legate_runtime().create_store( + types.float64, (max_nodes, num_outputs) + ) + feature = get_legate_runtime().create_store(types.int32, (max_nodes,)) + split_value = get_legate_runtime().create_store(types.float64, (max_nodes,)) + gain = get_legate_runtime().create_store(types.float64, (max_nodes,)) + hessian = get_legate_runtime().create_store( + types.float64, (max_nodes, num_outputs) + ) + task.add_output(leaf_value) + task.add_output(feature) + task.add_output(split_value) + task.add_output(gain) + task.add_output(hessian) + task.add_broadcast(leaf_value) + task.add_broadcast(feature) + task.add_broadcast(split_value) + task.add_broadcast(gain) + task.add_broadcast(hessian) + + if get_legate_runtime().machine.count(TaskTarget.GPU) > 1: + task.add_nccl_communicator() + elif get_legate_runtime().machine.count() > 1: + task.add_cpu_communicator() + task.execute() + + self.leaf_value = cn.array(leaf_value, copy=False) + self.feature = cn.array(feature, copy=False) + self.split_value = cn.array(split_value, copy=False) + self.gain = cn.array(gain, copy=False) + self.hessian = cn.array(hessian, copy=False) + return self + + def fit( + self, + X: Union[cn.ndarray, csr_matrix], + g: cn.ndarray, + h: cn.ndarray, + ) -> "Tree": + if isinstance(X, csr_matrix): + return self.fit_csr(X, g, h) + return self.fit_dense(X, g, h) + def clear(self) -> None: self.leaf_value.fill(0) self.hessian.fill(0) @@ -168,12 +260,12 @@ def update( self.hessian = cn.array(hessian, copy=False) return self - def predict(self, X: cn.ndarray) -> cn.ndarray: + def predict_dense(self, X: cn.ndarray) -> cn.ndarray: n_rows = X.shape[0] n_features = X.shape[1] n_outputs = self.leaf_value.shape[1] task = get_legate_runtime().create_auto_task( - user_context, LegateBoostOpCode.PREDICT + user_context, LegateBoostOpCode.PREDICT_TREE ) pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs)) @@ -197,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: task.add_alignment(X_, pred_) task.execute() + return cn.array(pred, copy=False) + def predict_csr(self, X: csr_matrix) -> cn.ndarray: + n_rows = X.shape[0] + n_outputs = self.leaf_value.shape[1] + task = get_legate_runtime().create_auto_task( + user_context, LegateBoostOpCode.PREDICT_TREE_CSR + ) + + pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs)) + # inputs + val_var = task.add_input(X.vals) + crd_var = task.add_input(X.crd) + pos_var = task.add_input(X.pos) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST) + ) + pos_var_broadcast = X.pos.promote(1, n_outputs) + task.add_alignment(pos_var_broadcast, pred) + + # scalars + task.add_scalar_arg(X.shape[1], types.int32) + + # output + task.add_output( + pred.promote(1, 1) + ) # add 1 dimension so it has the same dimension as dense version + task.add_output(pred) # only here for alignment, no used + + # broadcast the tree structure + leaf_value_ = get_store(self.leaf_value) + feature_ = get_store(self.feature) + split_value_ = get_store(self.split_value) + task.add_input(leaf_value_) + task.add_input(feature_) + task.add_input(split_value_) + task.add_broadcast(leaf_value_) + task.add_broadcast(feature_) + task.add_broadcast(split_value_) + + task.add_input(pos_var_broadcast) # used only for alignment + task.execute() return cn.array(pred, copy=False) + def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray: + if isinstance(X, csr_matrix): + return self.predict_csr(X) + return self.predict_dense(X) + def is_leaf(self, id: int) -> Any: return self.feature[id] == -1 @@ -245,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree": new = copy.deepcopy(self) new.leaf_value *= scalar return new + + def supports_csr(self) -> bool: + return True diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index 8502eef3..bb9fa9aa 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -72,3 +72,21 @@ def test_alpha(): ) model.fit(X, y) assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha)) + + +def test_csr(): + csr_matrix = pytest.importorskip("legate_sparse").csr_matrix + X = csr_matrix( + (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])), + shape=(2, 3), + ) + g = cn.array([[1.0], [-1.0]]) + h = cn.array([[1.0], [1.0]]) + + model = ( + lb.models.Tree(alpha=0.0) + .set_random_state(np.random.RandomState(2)) + .fit(X, g, h) + ) + print(model) + assert np.allclose(model.predict(X), -g / h) diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py index 5d02b421..f6fdcf41 100644 --- a/legateboost/test/test_estimator.py +++ b/legateboost/test/test_estimator.py @@ -1,12 +1,17 @@ import numpy as np import pytest +import scipy from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.utils.estimator_checks import parametrize_with_checks import cupynumeric as cn import legateboost as lb -from legateboost.testing.utils import non_increasing, sanity_check_models +from legateboost.testing.utils import ( + all_base_models, + non_increasing, + sanity_check_models, +) def test_init(): @@ -225,3 +230,56 @@ def test_iterator_methods(): assert list(model) == list(model.models_) for i, est in enumerate(model): assert est == model[i] + + +@pytest.mark.parametrize( + "base_model", filter(lambda m: m.supports_csr(), all_base_models()), ids=type +) +def test_csr_input(base_model): + csr_matrix = pytest.importorskip("legate_sparse").csr_matrix + X_dense = cn.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]) + X_scipy = scipy.sparse.csr_matrix(X_dense) + X_legate_sparse = csr_matrix(X_scipy) + y = cn.array([1.0, 2.0]) + model = lb.LBRegressor( + init=None, + n_estimators=1, + base_models=(base_model,), + learning_rate=1.0, + ) + sparse_pred = model.fit(X_scipy, y).predict(X_scipy) + legate_sparse_pred = model.fit(X_legate_sparse, y).predict(X_legate_sparse) + dense_pred = model.fit(X_dense, y).predict(X_dense) + assert cn.allclose(sparse_pred, legate_sparse_pred) + assert cn.allclose(sparse_pred, dense_pred) + + # Generate a sparse dataset and check that dense and sparse + # input give equivalent results + # unfortunately we can't test that they are bitwise identical + # the changing order of floating point sums can lead to different results + # so instead assert that sparse models are roughly as accurate as dense ones + rng = np.random.RandomState(0) + X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32) + y = rng.randint(0, 5, 100) + X_scipy = scipy.sparse.csr_matrix(X) + + # regression + params = {"base_models": (base_model,), "n_estimators": 5, "random_state": 0} + sparse_model = lb.LBRegressor(**params) + dense_model = lb.LBRegressor(**params) + dense_pred = dense_model.fit(X, y).predict(X) + X_csr = csr_matrix(X) + assert X_csr.nnz < X.size + sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr) + assert sparse_model.score(X, y) > 0.4 + assert dense_model.score(X, y) > 0.4 + sanity_check_models(sparse_model) + + # classification + sparse_model = lb.LBClassifier(**params) + dense_model = lb.LBClassifier(**params) + dense_pred = dense_model.fit(X, y).predict_proba(X) + sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr) + assert sparse_model.score(X, y) > 0.8 + assert dense_model.score(X, y) > 0.8 + sanity_check_models(sparse_model) diff --git a/pyproject.toml b/pyproject.toml index be4346af..8b14144a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ requires-python = ">=3.10" [project.optional-dependencies] test = [ "hypothesis>=6", + "legate-sparse", "matplotlib>=3.9", "mypy>=1.13", "nbconvert>=7.16", diff --git a/src/legateboost.h b/src/legateboost.h index bf8af18f..892c274a 100644 --- a/src/legateboost.h +++ b/src/legateboost.h @@ -20,7 +20,7 @@ enum LegateBoostOpCode { // NOLINT(performance-enum-size) OP_CODE_BASE = 0, BUILD_TREE = 1, - PREDICT = 2, + PREDICT_TREE = 2, UPDATE_TREE = 3, /* special */ ERF = 4, @@ -29,9 +29,11 @@ enum LegateBoostOpCode { // NOLINT(performance-enum-size) DIGAMMA = 7, ZETA = 8, /**/ - GATHER = 9, - RBF = 10, - BUILD_NN = 11, + GATHER = 9, + RBF = 10, + BUILD_NN = 11, + BUILD_TREE_CSR = 12, + PREDICT_TREE_CSR = 13, }; #endif // SRC_LEGATEBOOST_H_ diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index c942de45..0d14872a 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -25,6 +25,7 @@ #include "legate_library.h" #include "legateboost.h" #include "cpp_utils/cpp_utils.h" +#include "matrix_types.h" namespace legateboost { namespace { @@ -128,10 +129,9 @@ void WriteTreeOutput(legate::TaskContext context, const Tree& tree) // Share the samples with all workers // Remove any duplicates // Return sparse matrix of split samples for each feature -template +template class XMatrix> auto SelectSplitSamples(legate::TaskContext context, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const XMatrix& X, int split_samples, int seed, int64_t dataset_rows) -> SparseSplitProposals @@ -144,23 +144,22 @@ auto SelectSplitSamples(legate::TaskContext context, return dist(eng); }); - auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; - auto draft_proposals = legate::create_buffer({num_features, split_samples}); + auto draft_proposals = legate::create_buffer({X.NumFeatures(), split_samples}); for (int i = 0; i < split_samples; i++) { auto row = row_samples[i]; - bool const has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0]; - for (int j = 0; j < num_features; j++) { - draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0); + const bool has_data = X.RowSubset().contains(row); + for (int j = 0; j < X.NumFeatures(); j++) { + draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); } } - SumAllReduce(context, tcb::span(draft_proposals.ptr({0, 0}), num_features * split_samples)); + SumAllReduce(context, tcb::span(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples)); // Sort samples std::vector split_proposals_tmp; - split_proposals_tmp.reserve(num_features * split_samples); - auto row_pointers = legate::create_buffer(num_features + 1); + split_proposals_tmp.reserve(X.NumFeatures() * split_samples); + auto row_pointers = legate::create_buffer(X.NumFeatures() + 1); row_pointers[0] = 0; - for (int j = 0; j < num_features; j++) { + for (int j = 0; j < X.NumFeatures(); j++) { auto ptr = draft_proposals.ptr({j, 0}); tcb::span const feature_proposals(draft_proposals.ptr({j, 0}), split_samples); std::set const unique(feature_proposals.begin(), feature_proposals.end()); @@ -173,17 +172,18 @@ auto SelectSplitSamples(legate::TaskContext context, // Set the largest split sample to +inf such that an element must belong to one of the bins // i.e. we cannot go off the end when searching for a bin - for (int feature = 0; feature < num_features; feature++) { + for (int feature = 0; feature < X.NumFeatures(); feature++) { auto end = row_pointers[feature + 1]; split_proposals[end - 1] = std::numeric_limits::infinity(); } return SparseSplitProposals( - split_proposals, row_pointers, num_features, split_proposals_tmp.size()); + split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size()); } -template +template struct TreeBuilder { + using T = typename MatrixT::value_type; TreeBuilder(int32_t num_rows, int32_t num_features, int32_t num_outputs, @@ -194,6 +194,7 @@ struct TreeBuilder { num_features(num_features), num_outputs(num_outputs), max_nodes(max_nodes), + max_depth(max_depth), split_proposals(split_proposals) { sorted_positions = legate::create_buffer>(num_rows); @@ -211,24 +212,48 @@ struct TreeBuilder { split_proposals.histogram_size); max_batch_size = max_histogram_nodes; } - template - void ComputeHistogram(Histogram histogram, - legate::TaskContext context, - Tree& tree, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, - const legate::AccessorRO& g, - const legate::AccessorRO& h, - NodeBatch batch) + + auto Build(legate::TaskContext context, + const MatrixT& X_matrix, + const legate::AccessorRO& g_accessor, + const legate::AccessorRO& h_accessor, + legate::Rect<3> g_shape, + double alpha) -> Tree + { + // Begin building the tree + Tree tree(max_nodes, narrow(num_outputs)); + this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = this->PrepareBatches(depth); + for (auto batch : batches) { + auto histogram = this->GetHistogram(batch); + + this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); + + this->PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); } + } + return tree; + } + + void DenseHistogramKernel(const Tree& tree, + Histogram& histogram, + const DenseXMatrix& X, + const legate::AccessorRO& g, + const legate::AccessorRO& h, + NodeBatch batch) { // Build the histogram for (auto [position, index_local] : batch) { - auto index_global = index_local + X_shape.lo[0]; + auto index_global = index_local + X.RowSubset().lo[0]; bool const compute = ComputeHistogramBin( position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position))); if (position < 0 || !compute) { continue; } for (int64_t j = 0; j < num_features; j++) { - auto x_value = X[{index_global, j, 0}]; + auto x_value = X.Get(index_global, j); int const bin_idx = split_proposals.FindBin(x_value, j); for (int64_t k = 0; k < num_outputs; ++k) { @@ -237,6 +262,48 @@ struct TreeBuilder { } } } + } + + // Kernel specialised to iterate only over the non-zero elements of the sparse matrix + void CSRHistogramKernel(const Tree& tree, + Histogram& histogram, + const CSRXMatrix& X, + const legate::AccessorRO& g, + const legate::AccessorRO& h, + NodeBatch batch) + { + // Build the histogram + for (auto [position, index_local] : batch) { + auto index_global = index_local + X.RowSubset().lo[0]; + bool const compute = ComputeHistogramBin( + position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position))); + if (position < 0 || !compute) { continue; } + auto row_range = X.row_ranges[index_global]; + for (auto element_idx = row_range.lo[0]; element_idx <= row_range.hi[0]; element_idx++) { + auto feature = X.column_indices[element_idx]; + auto x = X.values[element_idx]; + int const bin_idx = split_proposals.FindBin(x, feature); + for (int64_t k = 0; k < num_outputs; ++k) { + histogram[{position, k, bin_idx}] += + GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]}; + } + } + } + } + + void ComputeHistogram(Histogram histogram, + legate::TaskContext context, + const Tree& tree, + const MatrixT& X, + const legate::AccessorRO& g, + const legate::AccessorRO& h, + NodeBatch batch) + { + if constexpr (std::is_same_v>) { + this->DenseHistogramKernel(tree, histogram, X, g, h, batch); + } else { + this->CSRHistogramKernel(tree, histogram, X, g, h, batch); + } // NCCL cannot allreduce custom types, need to reinterpret as double SumAllReduce( @@ -247,7 +314,7 @@ struct TreeBuilder { this->Scan(histogram, batch, tree); } - void Scan(Histogram histogram, NodeBatch batch, Tree& tree) + void Scan(Histogram histogram, NodeBatch batch, const Tree& tree) { auto scan_node_histogram = [&](int node_idx) { for (int feature = 0; feature < num_features; feature++) { @@ -298,8 +365,12 @@ struct TreeBuilder { } } } - void PerformBestSplit(Tree& tree, Histogram histogram, double alpha, NodeBatch batch) + void PerformBestSplit(Tree& tree, + const Histogram& histogram, + double alpha, + NodeBatch batch) { + const bool is_sparse_matrix = std::is_same_v>; for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) { double best_gain = 0; int best_feature = -1; @@ -309,10 +380,18 @@ struct TreeBuilder { for (int bin_idx = feature_begin; bin_idx < feature_end; bin_idx++) { double gain = 0; for (int output = 0; output < num_outputs; ++output) { - auto [G_L, H_L] = histogram[{node_id, output, bin_idx}]; - auto [G, H] = tree.node_sums[{node_id, output}]; - auto G_R = G - G_L; - auto H_R = H - H_L; + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + tree.node_sums[{node_id, output}], + node_id, + output, + bin_idx, + feature, + is_sparse_matrix); + auto [G_L, H_L] = left_sum; + auto [G_R, H_R] = right_sum; + auto [G, H] = tree.node_sums[{node_id, output}]; + double const reg = std::max(eps, alpha); // Regularisation term gain += 0.5 * ((G_L * G_L) / (H_L + reg) + (G_R * G_R) / (H_R + reg) - (G * G) / (H + reg)); @@ -325,34 +404,37 @@ struct TreeBuilder { } } if (best_gain > eps) { - std::vector left_leaf(num_outputs); - std::vector right_leaf(num_outputs); - std::vector left_sum(num_outputs); - std::vector right_sum(num_outputs); + std::vector left_leaves(num_outputs); + std::vector right_leaves(num_outputs); + std::vector left_sums(num_outputs); + std::vector right_sums(num_outputs); for (int output = 0; output < num_outputs; ++output) { - auto [G_L, H_L] = histogram[{node_id, output, best_bin}]; - auto [G, H] = tree.node_sums[{node_id, output}]; - auto G_R = G - G_L; - auto H_R = H - H_L; - left_leaf[output] = CalculateLeafValue(G_L, H_L, alpha); - right_leaf[output] = CalculateLeafValue(G_R, H_R, alpha); - left_sum[output] = {G_L, H_L}; - right_sum[output] = {G_R, H_R}; + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + tree.node_sums[{node_id, output}], + node_id, + output, + best_bin, + best_feature, + is_sparse_matrix); + left_leaves[output] = CalculateLeafValue(left_sum.grad, left_sum.hess, alpha); + right_leaves[output] = CalculateLeafValue(right_sum.grad, right_sum.hess, alpha); + left_sums[output] = left_sum; + right_sums[output] = right_sum; } - if (left_sum[0].hess <= 0.0 || right_sum[0].hess <= 0.0) { continue; } + if (left_sums[0].hess <= 0.0 || right_sums[0].hess <= 0.0) { continue; } tree.AddSplit(node_id, best_feature, split_proposals.split_proposals[legate::coord_t{best_bin}], - left_leaf, - right_leaf, + left_leaves, + right_leaves, best_gain, - left_sum, - right_sum); + left_sums, + right_sums); } } } - template - void UpdatePositions(Tree& tree, const legate::AccessorRO& X, legate::Rect<3> X_shape) + void UpdatePositions(Tree& tree, const MatrixT& X) { // Update the positions for (int i = 0; i < num_rows; i++) { @@ -361,7 +443,7 @@ struct TreeBuilder { sorted_positions[i] = {-1, index_local}; continue; } - auto x = X[{X_shape.lo[0] + index_local, tree.feature[pos], 0}]; + auto x = X.Get(X.RowSubset().lo[0] + index_local, tree.feature[pos]); bool const left = x <= tree.split_value[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); sorted_positions[i] = {pos, index_local}; @@ -448,18 +530,20 @@ struct TreeBuilder { int32_t num_features; int32_t num_outputs; int32_t max_nodes; + int32_t max_depth; int max_batch_size; SparseSplitProposals split_proposals; Histogram histogram; }; -struct build_tree_fn { +struct build_tree_dense_fn { template void operator()(legate::TaskContext context) { auto [X, X_shape, X_accessor] = GetInputStore(context.input(0).data()); auto [g, g_shape, g_accessor] = GetInputStore(context.input(1).data()); auto [h, h_shape, h_accessor] = GetInputStore(context.input(2).data()); + EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape); auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; auto num_rows = std::max(X_shape.hi[0] - X_shape.lo[0] + 1, 0); @@ -477,39 +561,74 @@ struct build_tree_fn { auto seed = context.scalars().at(4).value(); auto dataset_rows = context.scalars().at(5).value(); - Tree tree(max_nodes, narrow(num_outputs)); + DenseXMatrix const X_matrix(X_accessor, X_shape); + SparseSplitProposals const split_proposals = - SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows); + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); - // Begin building the tree - TreeBuilder builder( - num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals); + // Dispatch the tree building algorithm templated on the matrix type + auto tree = TreeBuilder>( + num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); + WriteTreeOutput(context, tree); + } +}; - builder.ComputeHistogram( - histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch); +struct build_tree_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + auto [g, g_shape, g_accessor] = GetInputStore(context.input(3).data()); + auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data()); + + auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0); + auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1; + EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers."); + + // Scalars + auto max_depth = context.scalars().at(0).value(); + auto max_nodes = context.scalars().at(1).value(); + auto alpha = context.scalars().at(2).value(); + auto split_samples = context.scalars().at(3).value(); + auto seed = context.scalars().at(4).value(); + auto dataset_rows = context.scalars().at(5).value(); + auto num_features = context.scalars().at(6).value(); + + CSRXMatrix const X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); + const SparseSplitProposals split_proposals = + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); + + auto tree = TreeBuilder>( + num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); } - } WriteTreeOutput(context, tree); } }; - } // namespace -/*static*/ void BuildTreeTask::cpu_variant(legate::TaskContext context) +/*static*/ void BuildTreeDenseTask::cpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + legateboost::type_dispatch_float(X.code(), build_tree_dense_fn(), context); +} + +/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); - legateboost::type_dispatch_float(X.code(), build_tree_fn(), context); + legateboost::type_dispatch_float(X.code(), build_tree_csr_fn(), context); } } // namespace legateboost @@ -518,6 +637,7 @@ namespace // unnamed { void __attribute__((constructor)) register_tasks() { - legateboost::BuildTreeTask::register_variants(); + legateboost::BuildTreeDenseTask::register_variants(); + legateboost::BuildTreeCSRTask::register_variants(); } } // namespace diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index 3c6e724d..e248b655 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -21,17 +21,18 @@ #include #include #include -#include #include #include #include #include +#include #include "legate_library.h" #include "legateboost.h" #include "../../cpp_utils/cpp_utils.h" #include "../../cpp_utils/cpp_utils.cuh" #include "legate/comm/coll.h" #include "build_tree.h" +#include "matrix_types.h" namespace legateboost { @@ -271,8 +272,7 @@ struct HistogramAgent { } }; - const legate::AccessorRO& X; - const int64_t& sample_offset; + const DenseXMatrix& X; const legate::AccessorRO& g; const legate::AccessorRO& h; const size_t& n_outputs; @@ -288,8 +288,7 @@ struct HistogramAgent { int feature_stride; SharedMemoryHistogram shared_histogram; - __device__ HistogramAgent(const legate::AccessorRO& X, - const int64_t& sample_offset, + __device__ HistogramAgent(const DenseXMatrix& X, const legate::AccessorRO& g, const legate::AccessorRO& h, const size_t& n_outputs, @@ -302,7 +301,6 @@ struct HistogramAgent { const int64_t& seed, SharedMemoryHistogramType* shared_memory) : X(X), - sample_offset(sample_offset), g(g), h(h), n_outputs(n_outputs), @@ -343,11 +341,10 @@ struct HistogramAgent { sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node))); if (!computeHistogram) { continue; } - auto x = X[{sample_offset + local_sample_idx, feature, 0}]; - // int bin_idx = shared_split_proposals.FindBin(x, feature); + auto x = X.Get(X.RowSubset().lo[0] + local_sample_idx, feature); int const bin_idx = split_proposals.FindBin(x, feature); - legate::Point<3> p = {sample_offset + local_sample_idx, 0, output}; + legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx, 0, output}; auto gpair_quantised = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) @@ -387,7 +384,7 @@ struct HistogramAgent { std::array x{}; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - x[i] = X[{sample_offset + local_sample_idx[i], feature[i], 0}]; + x[i] = X.Get(X.RowSubset().lo[0] + local_sample_idx[i], feature[i]); } std::array bin_idx{}; @@ -398,7 +395,7 @@ struct HistogramAgent { std::array gpair{}; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output}; + legate::Point<3> p = {sample_node[i] + local_sample_idx[i], 0, output}; gpair[i] = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); } #pragma unroll @@ -443,18 +440,17 @@ struct HistogramAgent { // NOLINTBEGIN(performance-unnecessary-value-param) template __global__ void __launch_bounds__(kBlockThreads) - fill_histogram_shared(legate::AccessorRO X, - int64_t sample_offset, - legate::AccessorRO g, - legate::AccessorRO h, - size_t n_outputs, - SparseSplitProposals split_proposals, - NodeBatch batch, - Histogram histogram, - legate::Buffer node_sums, - GradientQuantiser quantiser, - legate::Buffer feature_groups, - int64_t seed) + fill_histogram_dense(DenseXMatrix X, + legate::AccessorRO g, + legate::AccessorRO h, + size_t n_outputs, + SparseSplitProposals split_proposals, + NodeBatch batch, + Histogram histogram, + legate::Buffer node_sums, + GradientQuantiser quantiser, + legate::Buffer feature_groups, + int64_t seed) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,hicpp-avoid-c-arrays) __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)]; @@ -463,7 +459,6 @@ __global__ void __launch_bounds__(kBlockThreads) // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) reinterpret_cast(shared_char); HistogramAgent agent(X, - sample_offset, g, h, n_outputs, @@ -479,9 +474,56 @@ __global__ void __launch_bounds__(kBlockThreads) } // NOLINTEND(performance-unnecessary-value-param) +// NOLINTBEGIN(performance-unnecessary-value-param) +template +__global__ void __launch_bounds__(kBlockThreads) + fill_histogram_csr(CSRXMatrix X, + legate::AccessorRO g, + legate::AccessorRO h, + size_t n_outputs, + SparseSplitProposals split_proposals, + NodeBatch batch, + Histogram histogram, + legate::Buffer node_sums, + GradientQuantiser quantiser, + int64_t seed) +{ + // Grid stride loop over rows + for (std::size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x; idx < batch.InstancesInBatch(); + idx += static_cast(blockDim.x * gridDim.x)) { + auto [sample_node, local_sample_idx] = batch.instances[idx]; + // If we don't need to compute this node, skip + if (!ComputeHistogramBin( + sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)))) { + continue; + } + // Which matrix elements belong to this row? + std::size_t const global_sample_idx = X.RowSubset().lo[0] + local_sample_idx; + auto elements = X.row_ranges[global_sample_idx]; + for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) { + auto feature = X.column_indices[X.vals_shape.lo[0] + element]; + auto x = X.values[X.vals_shape.lo[0] + element]; + int const bin_idx = split_proposals.FindBin(x, feature); + for (int output = 0; output < n_outputs; output++) { + legate::Point<3> p = {global_sample_idx, 0, output}; + auto gpair_quantised = + quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + auto* addPosition = reinterpret_cast::atomic_add_type*>( + &histogram[{sample_node, output, bin_idx}]); + atomicAdd(addPosition, gpair_quantised.grad); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + atomicAdd(addPosition + 1, gpair_quantised.hess); + } + } + } +} +// NOLINTEND(performance-unnecessary-value-param) + // Manage the launch parameters for histogram kernel -template +template struct HistogramKernel { + using T = typename MatrixT::value_type; static const std::int32_t kItemsPerTile = kBlockThreads * kItemsPerThread; legate::Buffer feature_groups; int num_groups{}; @@ -494,10 +536,7 @@ struct HistogramKernel { CHECK_CUDA(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); std::int32_t n_blocks_per_mp = 0; CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &n_blocks_per_mp, - fill_histogram_shared, - kBlockThreads, - 0)); + &n_blocks_per_mp, fill_histogram_dense, kBlockThreads, 0)); this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps; FindFeatureGroups(split_proposals, stream); } @@ -537,8 +576,7 @@ struct HistogramKernel { stream)); } - void BuildHistogram(const legate::AccessorRO& X, - int64_t sample_offset, + void BuildHistogram(const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, size_t n_outputs, @@ -550,25 +588,35 @@ struct HistogramKernel { int64_t seed, cudaStream_t stream) { - int const average_features_per_group = split_proposals.num_features / num_groups; - std::size_t const average_elements_per_group = - batch.InstancesInBatch() * average_features_per_group; - auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile; - auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); - // Launch the kernel - fill_histogram_shared - <<>>(X, - sample_offset, - g, - h, - n_outputs, - split_proposals, - batch, - histogram, - node_sums, - quantiser, - feature_groups, - seed); + if constexpr (std::is_same_v>) { + int const average_features_per_group = split_proposals.num_features / num_groups; + std::size_t const average_elements_per_group = + batch.InstancesInBatch() * average_features_per_group; + auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile; + auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); + // Launch the kernel + fill_histogram_dense + <<>>(X, + g, + h, + n_outputs, + split_proposals, + batch, + histogram, + node_sums, + quantiser, + feature_groups, + seed); + } else { + // For sparse data we don't currently make use of feature groups or shared memory + // Use 1 thread per row for lack of a better option currently + // Other methods might involve complicated load balancing + auto min_blocks = (batch.InstancesInBatch() + kItemsPerTile - 1) / kItemsPerTile; + auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); + // Launch the kernel + fill_histogram_csr<<>>( + X, g, h, n_outputs, split_proposals, batch, histogram, node_sums, quantiser, seed); + } } }; @@ -675,11 +723,11 @@ struct GainFeaturePair { }; // NOLINTBEGIN(performance-unnecessary-value-param) -template +template __global__ void __launch_bounds__(BLOCK_THREADS) perform_best_split(Histogram histogram, size_t n_outputs, - SparseSplitProposals split_proposals, + SparseSplitProposals split_proposals, double eps, double alpha, legate::Buffer tree_leaf_value, @@ -688,7 +736,8 @@ __global__ void __launch_bounds__(BLOCK_THREADS) legate::Buffer tree_split_value, legate::Buffer tree_gain, NodeBatch batch, - GradientQuantiser quantiser) + GradientQuantiser quantiser, + bool is_sparse) { // using one block per (level) node to have blockwise reductions int const node_id = narrow(batch.node_idx_begin + blockIdx.x); @@ -708,9 +757,12 @@ __global__ void __launch_bounds__(BLOCK_THREADS) bin_idx += BLOCK_THREADS) { double gain = 0; for (int output = 0; output < n_outputs; ++output) { - auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto left_sum = vectorised_load(&histogram[{node_id, output, bin_idx}]); - auto right_sum = node_sum - left_sum; + auto node_sum = vectorised_load(&node_sums[{node_id, output}]); + + auto feature = split_proposals.FindFeature(bin_idx); + auto [left_sum, right_sum] = InferSplitSums( + histogram, split_proposals, node_sum, node_id, output, bin_idx, feature, is_sparse); + if (left_sum.hess <= 0 || right_sum.hess <= 0) { gain = 0; break; @@ -745,9 +797,17 @@ __global__ void __launch_bounds__(BLOCK_THREADS) if (node_best_gain > eps) { int const node_best_feature = split_proposals.FindFeature(node_best_bin_idx); for (int output = narrow_cast(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) { - auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto left_sum = vectorised_load(&histogram[{node_id, output, node_best_bin_idx}]); - auto right_sum = node_sum - left_sum; + auto node_sum = vectorised_load(&node_sums[{node_id, output}]); + + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + node_sum, + node_id, + output, + node_best_bin_idx, + node_best_feature, + is_sparse); + node_sums[{BinaryTree::LeftChild(node_id), output}] = left_sum; node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum; @@ -809,11 +869,12 @@ struct Tree { [=] __device__(const legate::Point& p) { out_acc[p] = x[p]; }); } - template - void WriteTreeOutput(legate::TaskContext context, - const ThrustPolicyT& policy, - GradientQuantiser quantiser) + void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser) { + auto* stream = context.get_task_stream(); + auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); + auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); + WriteOutput(context.output(0).data(), leaf_value, policy); WriteOutput(context.output(1).data(), feature, policy); WriteOutput(context.output(2).data(), split_value, policy); @@ -833,7 +894,7 @@ struct Tree { ~Tree() = default; Tree(const Tree&) = delete; - Tree(Tree&&) = delete; + Tree(Tree&&) = default; auto operator=(const Tree&) -> Tree& = delete; auto operator=(Tree&&) -> Tree& = delete; @@ -842,8 +903,8 @@ struct Tree { legate::Buffer split_value; legate::Buffer gain; legate::Buffer node_sums; - const int num_outputs; - const int max_nodes; + int num_outputs; + int max_nodes; cudaStream_t stream; }; @@ -851,10 +912,9 @@ struct Tree { // Use nccl to share the samples with all workers // Remove any duplicates // Return sparse matrix of split samples for each feature -template +template class XMatrix> auto SelectSplitSamples(legate::TaskContext context, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const XMatrix& X, int split_samples, int seed, int64_t dataset_rows, @@ -862,7 +922,6 @@ auto SelectSplitSamples(legate::TaskContext context, { auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); - auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; // Randomly choose split_samples rows auto row_samples = legate::create_buffer(split_samples); auto counting = thrust::make_counting_iterator(0); @@ -873,54 +932,57 @@ auto SelectSplitSamples(legate::TaskContext context, eng.discard(idx); return dist(eng); }); - auto draft_proposals = legate::create_buffer({num_features, split_samples}); + auto draft_proposals = legate::create_buffer({X.NumFeatures(), split_samples}); // fill with local data - LaunchN(num_features * split_samples, stream, [=] __device__(auto idx) { - auto i = idx / num_features; - auto j = idx % num_features; + LaunchN(X.NumFeatures() * split_samples, stream, [=] __device__(auto idx) { + auto i = idx / X.NumFeatures(); + auto j = idx % X.NumFeatures(); auto row = row_samples[i]; - bool const has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0]; - draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0); + bool const has_data = X.RowSubset().contains(row); + draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); }); // Sum reduce over all workers SumAllReduce( - context, tcb::span(draft_proposals.ptr({0, 0}), num_features * split_samples), stream); + context, tcb::span(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples), stream); CHECK_CUDA_STREAM(stream); // Condense split samples to unique values // First sort the samples - auto keys = legate::create_buffer(num_features * split_samples); - thrust::transform( - policy, counting, counting + num_features * split_samples, keys.ptr(0), [=] __device__(int i) { - return i / split_samples; - }); + auto keys = legate::create_buffer(X.NumFeatures() * split_samples); + thrust::transform(policy, + counting, + counting + X.NumFeatures() * split_samples, + keys.ptr(0), + [=] __device__(int i) { return i / split_samples; }); // Segmented sort auto begin = thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0}))); - thrust::sort(policy, begin, begin + num_features * split_samples, [] __device__(auto a, auto b) { - if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); } - return thrust::get<1>(a) < thrust::get<1>(b); - }); + thrust::sort( + policy, begin, begin + X.NumFeatures() * split_samples, [] __device__(auto a, auto b) { + if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); } + return thrust::get<1>(a) < thrust::get<1>(b); + }); // Extract the unique values - auto out_keys = legate::create_buffer(num_features * split_samples); - auto split_proposals = legate::create_buffer(num_features * split_samples); + auto out_keys = legate::create_buffer(X.NumFeatures() * split_samples); + auto split_proposals = legate::create_buffer(X.NumFeatures() * split_samples); auto key_val = thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0}))); auto out_iter = thrust::make_zip_iterator(thrust::make_tuple(out_keys.ptr(0), split_proposals.ptr(0))); auto result = - thrust::unique_copy(policy, key_val, key_val + num_features * split_samples, out_iter); + thrust::unique_copy(policy, key_val, key_val + X.NumFeatures() * split_samples, out_iter); auto n_unique = thrust::distance(out_iter, result); // Count the unique values for each feature - auto row_pointers = legate::create_buffer(num_features + 1); - CHECK_CUDA(cudaMemsetAsync(row_pointers.ptr(0), 0, (num_features + 1) * sizeof(int32_t), stream)); + auto row_pointers = legate::create_buffer(X.NumFeatures() + 1); + CHECK_CUDA( + cudaMemsetAsync(row_pointers.ptr(0), 0, (X.NumFeatures() + 1) * sizeof(int32_t), stream)); - tcb::span const out_keys_span(out_keys.ptr(0), num_features * split_samples); + tcb::span const out_keys_span(out_keys.ptr(0), X.NumFeatures() * split_samples); auto unique_keys_span = out_keys_span.subspan(0, n_unique); thrust::reduce_by_key(policy, unique_keys_span.begin(), @@ -929,15 +991,15 @@ auto SelectSplitSamples(legate::TaskContext context, thrust::make_discard_iterator(), row_pointers.ptr(1)); // Scan the counts to get the row pointers for a CSR matrix - tcb::span const row_pointers_span(row_pointers.ptr(0), num_features + 1); + tcb::span const row_pointers_span(row_pointers.ptr(0), X.NumFeatures() + 1); thrust::inclusive_scan(policy, row_pointers_span.begin() + 1, - row_pointers_span.begin() + 1 + num_features, + row_pointers_span.begin() + 1 + X.NumFeatures(), row_pointers_span.begin() + 1); // Set the largest split sample to +inf such that an element must belong to one of the bins // i.e. we cannot go off the end when searching for a bin - LaunchN(num_features, stream, [=] __device__(int i) { + LaunchN(X.NumFeatures(), stream, [=] __device__(int i) { auto end = row_pointers_span[i + 1]; split_proposals[end - 1] = std::numeric_limits::infinity(); }); @@ -946,7 +1008,7 @@ auto SelectSplitSamples(legate::TaskContext context, row_samples.destroy(); draft_proposals.destroy(); out_keys.destroy(); - return SparseSplitProposals(split_proposals, row_pointers, num_features, n_unique); + return SparseSplitProposals(split_proposals, row_pointers, X.NumFeatures(), n_unique); } // Can't put a device lambda in constructor so make this a function @@ -961,8 +1023,9 @@ void FillPositions(const legate::Buffer>& sor } // namespace -template +template struct TreeBuilder { + using T = typename MatrixT::value_type; TreeBuilder(int32_t num_rows, int32_t num_features, int32_t num_outputs, @@ -970,15 +1033,18 @@ struct TreeBuilder { int32_t max_nodes, int32_t max_depth, const SparseSplitProposals& split_proposals, - GradientQuantiser quantiser) + GradientQuantiser quantiser, + int64_t seed) : num_rows(num_rows), num_features(num_features), num_outputs(num_outputs), stream(stream), max_nodes(max_nodes), + max_depth(max_depth), split_proposals(split_proposals), quantiser(quantiser), - histogram_kernel(split_proposals, stream) + histogram_kernel(split_proposals, stream), + seed(seed) { sorted_positions = legate::create_buffer>(num_rows); FillPositions(sorted_positions, num_rows, stream); @@ -1002,14 +1068,45 @@ struct TreeBuilder { max_batch_size = max_histogram_nodes; } + auto Build(legate::TaskContext context, + const MatrixT& X_matrix, + const legate::AccessorRO& g_accessor, + const legate::AccessorRO& h_accessor, + legate::Rect<3> g_shape, + double alpha) -> Tree + { + auto* stream = context.get_task_stream(); + auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); + auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); + + Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); + + this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); + + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = this->PrepareBatches(depth); + for (auto batch : batches) { + auto histogram = this->GetHistogram(batch); + + this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); + + this->PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); } + } + + return tree; + } + TreeBuilder(const TreeBuilder&) = delete; TreeBuilder(TreeBuilder&&) = delete; auto operator=(const TreeBuilder&) -> TreeBuilder& = delete; auto operator=(TreeBuilder&&) -> TreeBuilder& = delete; ~TreeBuilder() = default; - template - void UpdatePositions(Tree& tree, const legate::AccessorRO& X, legate::Rect<3> X_shape) + void UpdatePositions(Tree& tree, const MatrixT& X) { tcb::span const tree_feature_span(tree.feature.ptr(0), max_nodes); tcb::span const tree_split_value_span(tree.split_value.ptr(0), max_nodes); @@ -1024,10 +1121,13 @@ struct TreeBuilder { sorted_positions[idx] = cuda::std::make_tuple(-1, row); return; } + double const x_value = - X[{X_shape.lo[0] + static_cast(row), tree_feature_span[pos], 0}]; + X.Get(X.RowSubset().lo[0] + static_cast(row), tree_feature_span[pos]); bool const left = x_value <= tree_split_value_span[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); + // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos], + // x_value, pos); sorted_positions[idx] = cuda::std::make_tuple(pos, row); }); CHECK_CUDA_STREAM(stream); @@ -1064,19 +1164,15 @@ struct TreeBuilder { stream)); } - template void ComputeHistogram(Histogram histogram, legate::TaskContext context, Tree& tree, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, - NodeBatch batch, - int64_t seed) + NodeBatch batch) { histogram_kernel.BuildHistogram(X, - X_shape.lo[0], g, h, num_outputs, @@ -1127,7 +1223,8 @@ struct TreeBuilder { tree.split_value, tree.gain, batch, - quantiser); + quantiser, + std::is_same_v>); CHECK_CUDA_STREAM(stream); } void InitialiseRoot(legate::TaskContext context, @@ -1135,8 +1232,7 @@ struct TreeBuilder { const legate::AccessorRO& g, const legate::AccessorRO& h, legate::Rect<3> g_shape, - double alpha, - int64_t seed) + double alpha) { const int kBlockThreads = 256; const size_t blocks = (num_rows + kBlockThreads - 1) / kBlockThreads; @@ -1243,11 +1339,13 @@ struct TreeBuilder { const int32_t num_features; const int32_t num_outputs; const int32_t max_nodes; + const int32_t max_depth; + const int64_t seed; SparseSplitProposals split_proposals; Histogram histogram; int max_batch_size; GradientQuantiser quantiser; - HistogramKernel histogram_kernel; + HistogramKernel histogram_kernel; cudaStream_t stream; }; @@ -1260,6 +1358,8 @@ struct build_tree_fn { auto [g, g_shape, g_accessor] = GetInputStore(context.input(1).data()); auto [h, h_shape, h_accessor] = GetInputStore(context.input(2).data()); + DenseXMatrix const X_matrix(X_accessor, X_shape); + EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape); auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; auto num_rows = std::max(X_shape.hi[0] - X_shape.lo[0] + 1, 0); @@ -1281,51 +1381,96 @@ struct build_tree_fn { auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); - Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); - - SparseSplitProposals const split_proposals = - SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows, stream); + const SparseSplitProposals split_proposals = + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream); - // Begin building the tree - TreeBuilder builder(num_rows, - num_features, - num_outputs, - stream, - tree.max_nodes, - max_depth, - split_proposals, - quantiser); + auto tree = TreeBuilder>(num_rows, + num_features, + num_outputs, + stream, + max_nodes, + max_depth, + split_proposals, + quantiser, + seed) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed); + tree.WriteTreeOutput(context, quantiser); - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA_STREAM(stream); + } +}; + +struct build_tree_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + auto [g, g_shape, g_accessor] = GetInputStore(context.input(3).data()); + auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data()); + + auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0); + auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1; + EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers."); - builder.ComputeHistogram( - histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch, seed); + // Scalars + auto max_depth = context.scalars().at(0).value(); + auto max_nodes = context.scalars().at(1).value(); + auto alpha = context.scalars().at(2).value(); + auto split_samples = context.scalars().at(3).value(); + auto seed = context.scalars().at(4).value(); + auto dataset_rows = context.scalars().at(5).value(); + auto num_features = context.scalars().at(6).value(); + + auto* stream = context.get_task_stream(); + CSRXMatrix const X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); + const SparseSplitProposals split_proposals = + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); } - } + GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream); - tree.WriteTreeOutput(context, thrust_exec_policy, quantiser); + // Begin building the tree + auto tree = TreeBuilder>(num_rows, + num_features, + num_outputs, + stream, + max_nodes, + max_depth, + split_proposals, + quantiser, + seed) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); + + tree.WriteTreeOutput(context, quantiser); CHECK_CUDA(cudaStreamSynchronize(stream)); CHECK_CUDA_STREAM(stream); } }; -/*static*/ void BuildTreeTask::gpu_variant(legate::TaskContext context) +/*static*/ void BuildTreeDenseTask::gpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); type_dispatch_float(X.code(), build_tree_fn(), context); } +/*static*/ void BuildTreeCSRTask::gpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + type_dispatch_float(X.code(), build_tree_csr_fn(), context); +} + } // namespace legateboost diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index af8aa9d4..442b02cb 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -41,6 +41,12 @@ struct GPairBase { this->hess += b.hess; return *this; } + __host__ __device__ auto operator-=(const GPairBase& b) -> GPairBase& + { + this->grad -= b.grad; + this->hess -= b.hess; + return *this; + } }; template @@ -243,9 +249,57 @@ class Histogram { { return buffer_[{p[0] - node_begin_, p[1], p[2]}]; } + + // Node, output, bin + __host__ __device__ auto operator[](legate::Point<3> p) const -> GPairT + { + return buffer_[{p[0] - node_begin_, p[1], p[2]}]; + } +}; + +// From the scanned histogram gradients and the node sums, infer the gradients for the left and +// right partitions For a dense matrix the left sum is the scanned histogram and the right sum is +// the node sum minus the left sum In the case where we have a sparse matrix, gradients for 0's have +// not been accumulated in the histogram. We can infer the gradients at the matrix zeroes by +// subtracting the sum of the gradients at the last bin (which contains gradients from every +// non-zero element) from the sum of the gradients in the node (this sum always includes gradients +// for every element in that node) +template +__host__ __device__ auto InferSplitSums(const Histogram& scanned_histogram, + const SparseSplitProposals& split_proposals, + const GPairT& node_sum, + int node_id, + int output, + int bin_idx, + int feature, + bool is_sparse) -> std::tuple +{ + auto left_sum = scanned_histogram[{node_id, output, bin_idx}]; + auto right_sum = node_sum - left_sum; + if (!is_sparse) { return std::make_tuple(left_sum, right_sum); } + auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature); + auto scan_sum = scanned_histogram[{node_id, output, feature_end - 1}]; + auto zero_bin = split_proposals.FindBin(0.0, feature); + auto sparse_sum = node_sum - scan_sum; + if (bin_idx < zero_bin) { + // Do nothing, this amount is already on the right + } else { + // Move it to the left + left_sum += sparse_sum; + right_sum -= sparse_sum; + } + return std::make_tuple(left_sum, right_sum); +} + +class BuildTreeDenseTask : public Task { + public: + static void cpu_variant(legate::TaskContext context); +#ifdef LEGATEBOOST_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif }; -class BuildTreeTask : public Task { +class BuildTreeCSRTask : public Task { public: static constexpr auto CPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h new file mode 100644 index 00000000..32eb62ab --- /dev/null +++ b/src/models/tree/matrix_types.h @@ -0,0 +1,124 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#pragma once +#include +#include +#include +#ifdef __CUDACC__ +#include +#else +#include +#endif + +namespace legateboost { +// Create a uniform interface to two matrix formats +// Dense and CSR +template +class DenseXMatrix { + public: + using value_type = T; + + private: + legate::AccessorRO x; + legate::Rect<3> shape; + + public: + DenseXMatrix(const legate::AccessorRO& x, legate::Rect<3> shape) + : x(std::move(x)), shape(shape) + { + } + // Global row index refers to the index across partitions + // For features, each worker has every feature so the global is the same as the local index + [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const + -> T + { + return x[legate::Point<3>{global_row_idx, feature_idx, 0}]; + } + [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int + { + return shape.hi[1] - shape.lo[1] + 1; + } + [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t> + { + return {shape.lo[0], shape.hi[0]}; + } +}; + +template +class CSRXMatrix { + public: + using value_type = T; + + legate::AccessorRO values; + legate::Rect<1, legate::coord_t> vals_shape; + legate::AccessorRO column_indices; + legate::AccessorRO, 1> row_ranges; + legate::Rect<1, legate::coord_t> row_ranges_shape; + int num_features; + std::size_t nnz; // The number of nnz in ths local partition + + CSRXMatrix(const legate::AccessorRO& values, + const legate::AccessorRO& column_indices, + const legate::AccessorRO, 1>& row_ranges, + legate::Rect<1, legate::coord_t> vals_shape, + legate::Rect<1, legate::coord_t> row_ranges_shape, + int num_features, + std::size_t nnz) + : values(std::move(values)), + column_indices(column_indices), + row_ranges(row_ranges), + num_features(num_features), + vals_shape(vals_shape), + row_ranges_shape(row_ranges_shape), + nnz(nnz) + { + } + + // Global row index refers to the index across partitions + // For features, each worker has every feature so the global is the same as the local index + // This method is less efficient than its Dense counterpart due to the need to search for the + // feature + [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const + -> T + { + auto row_range = row_ranges[narrow(global_row_idx)]; + + tcb::span const column_indices_span(column_indices.ptr(row_range.lo), + row_range.volume()); + +#ifdef __CUDACC__ + const auto* result = thrust::lower_bound( + thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx); +#else + const auto* result = + std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx); +#endif + + if (result != column_indices_span.end() && *result == feature_idx) { + return values[row_range.lo + (result - column_indices_span.begin())]; + } + return 0; + } + + [[nodiscard]] auto NNZ() const { return nnz; } + + [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int { return num_features; } + [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t> + { + return row_ranges_shape; + } +}; +} // namespace legateboost diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index a87d5044..13e36f0c 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -16,11 +16,34 @@ #include "predict.h" #include #include "../../cpp_utils/cpp_utils.h" +#include "matrix_types.h" namespace legateboost { namespace { -struct predict_fn { +template +void PredictRows(const MatrixT& X, + const legate::AccessorWO& pred_accessor, + legate::Rect<3, legate::coord_t> pred_shape, + const legate::AccessorRO& split_value, + const legate::AccessorRO& feature, + const legate::AccessorRO& leaf_value) +{ + for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) { + int pos = 0; + // Use a max depth of 100 to avoid infinite loops + const int max_depth = 100; + for (int depth = 0; depth < max_depth; depth++) { + if (feature[pos] == -1) { break; } + auto x = X.Get(i, feature[pos]); + pos = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; + } + for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { + pred_accessor[{i, 0, j}] = leaf_value[{pos, j}]; + } + } +} +struct predict_dense_fn { template void operator()(legate::TaskContext context) { @@ -45,27 +68,59 @@ struct predict_fn { EXPECT_IS_BROADCAST(context.input(2).data().shape<1>()); EXPECT_IS_BROADCAST(context.input(3).data().shape<1>()); - for (int64_t i = X_shape.lo[0]; i <= X_shape.hi[0]; i++) { - int pos = 0; - // Use a max depth of 100 to avoid infinite loops - const int max_depth = 100; - for (int depth = 0; depth < max_depth; depth++) { - if (feature[pos] == -1) { break; } - auto x = X_accessor[{i, feature[pos], 0}]; - pos = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; - } - for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { - pred_accessor[{i, 0, j}] = leaf_value[{pos, j}]; - } - } + PredictRows(DenseXMatrix(X_accessor, X_shape), + pred_accessor, + pred_shape, + split_value, + feature, + leaf_value); + } +}; + +struct predict_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + + auto leaf_value = context.input(3).data().read_accessor(); + auto feature = context.input(4).data().read_accessor(); + auto split_value = context.input(5).data().read_accessor(); + + auto pred = context.output(0).data(); + auto pred_shape = pred.shape<3>(); + auto pred_accessor = pred.write_accessor(); + + auto num_features = context.scalars().at(0).value(); + CSRXMatrix const X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); + + EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); + + PredictRows(X, pred_accessor, pred_shape, split_value, feature, leaf_value); } }; } // namespace -/*static*/ void PredictTask::cpu_variant(legate::TaskContext context) +/*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + type_dispatch_float(X.code(), predict_dense_fn(), context); +} + +/*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); - type_dispatch_float(X.code(), predict_fn(), context); + type_dispatch_float(X.code(), predict_csr_fn(), context); } } // namespace legateboost @@ -74,6 +129,7 @@ namespace // unnamed { void __attribute__((constructor)) register_tasks() { - legateboost::PredictTask::register_variants(); + legateboost::PredictTreeTask::register_variants(); + legateboost::PredictTreeCSRTask::register_variants(); } } // namespace diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu index 65837619..7c93b0f6 100644 --- a/src/models/tree/predict.cu +++ b/src/models/tree/predict.cu @@ -19,11 +19,42 @@ #include "../../cpp_utils/cpp_utils.cuh" #include "../../cpp_utils/cpp_utils.h" #include "predict.h" +#include "matrix_types.h" namespace legateboost { namespace { -struct predict_fn { + +template +void PredictRows(const MatrixT& X, + const legate::AccessorWO& pred_accessor, + legate::Rect<3, legate::coord_t> pred_shape, + const legate::AccessorRO& split_value, + const legate::AccessorRO& feature, + const legate::AccessorRO& leaf_value, + cudaStream_t stream) +{ + // rowwise kernel + auto prediction_lambda = [=] __device__(size_t idx) { + int64_t pos = 0; + auto global_row_idx = X.RowSubset().lo + idx; + // Use a max depth of 100 to avoid infinite loops + const int max_depth = 100; + for (int depth = 0; depth < max_depth; depth++) { + if (feature[pos] == -1) { break; } + double const X_val = X.Get(global_row_idx, feature[pos]); + pos = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; + } + for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { + pred_accessor[{global_row_idx, 0, j}] = leaf_value[{pos, j}]; + } + }; // NOLINT(readability/braces) + + LaunchN(X.RowSubset().volume(), stream, prediction_lambda); + CHECK_CUDA_STREAM(stream); +} + +struct predict_dense_fn { template void operator()(legate::TaskContext context) { @@ -39,7 +70,6 @@ struct predict_fn { auto pred = context.output(0).data(); auto pred_shape = pred.shape<3>(); auto pred_accessor = pred.write_accessor(); - auto n_outputs = pred_shape.hi[2] - pred_shape.lo[2] + 1; EXPECT(pred_shape.lo[2] == 0, "Expect all outputs to be present"); // We should have one output prediction per row of X @@ -50,36 +80,61 @@ struct predict_fn { EXPECT_IS_BROADCAST(context.input(2).data().shape<1>()); EXPECT_IS_BROADCAST(context.input(3).data().shape<1>()); - // rowwise kernel - auto prediction_lambda = [=] __device__(size_t idx) { - int64_t pos = 0; - legate::Point<3> x_point = {X_shape.lo[0] + static_cast(idx), 0, 0}; - - // Use a max depth of 100 to avoid infinite loops - const int max_depth = 100; - for (int depth = 0; depth < max_depth; depth++) { - if (feature[pos] == -1) { break; } - x_point[1] = feature[pos]; - double const X_val = X_accessor[x_point]; - pos = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; - } - for (int64_t j = 0; j < n_outputs; j++) { - pred_accessor[{X_shape.lo[0] + static_cast(idx), 0, j}] = leaf_value[{pos, j}]; - } - }; // NOLINT(readability/braces) - - auto* stream = context.get_task_stream(); - LaunchN(X_shape.hi[0] - X_shape.lo[0] + 1, stream, prediction_lambda); - - CHECK_CUDA_STREAM(stream); + PredictRows(DenseXMatrix(X_accessor, X_shape), + pred_accessor, + pred_shape, + split_value, + feature, + leaf_value, + context.get_task_stream()); + } +}; + +struct predict_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + + auto leaf_value = context.input(3).data().read_accessor(); + auto feature = context.input(4).data().read_accessor(); + auto split_value = context.input(5).data().read_accessor(); + + auto pred = context.output(0).data(); + auto pred_shape = pred.shape<3>(); + auto pred_accessor = pred.write_accessor(); + + auto num_features = context.scalars().at(0).value(); + CSRXMatrix const X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); + + EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); + + PredictRows( + X, pred_accessor, pred_shape, split_value, feature, leaf_value, context.get_task_stream()); } }; } // namespace -/*static*/ void PredictTask::gpu_variant(legate::TaskContext context) +/*static*/ void PredictTreeTask::gpu_variant(legate::TaskContext context) +{ + auto X = context.input(0).data(); + type_dispatch_float(X.code(), predict_dense_fn(), context); +} + +/*static*/ void PredictTreeCSRTask::gpu_variant(legate::TaskContext context) { auto X = context.input(0).data(); - type_dispatch_float(X.code(), predict_fn(), context); + type_dispatch_float(X.code(), predict_csr_fn(), context); } } // namespace legateboost diff --git a/src/models/tree/predict.h b/src/models/tree/predict.h index 1d98e6b5..c55efd42 100644 --- a/src/models/tree/predict.h +++ b/src/models/tree/predict.h @@ -20,7 +20,15 @@ namespace legateboost { -class PredictTask : public Task { +class PredictTreeTask : public Task { + public: + static void cpu_variant(legate::TaskContext context); +#ifdef LEGATEBOOST_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +class PredictTreeCSRTask : public Task { public: static void cpu_variant(legate::TaskContext context); #ifdef LEGATEBOOST_USE_CUDA