From 11bd20355061b1c947b6c716c4685a4b66115350 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:11:09 +0100 Subject: [PATCH] Initial commit: preliminary tests, some helper functions --- scripts/builtin/hdbscan.dml | 132 ++++++++++++++++++ scripts/builtin/hdbscanApply.dml | 0 .../functions/builtin/BuiltinHDBSCANTest.java | 0 src/test/scripts/functions/builtin/hdbscan.R | 0 .../scripts/functions/builtin/hdbscan.dml | 20 +++ .../functions/builtin/hdbscanApply.dml | 0 test_build_mst.dml | 46 ++++++ test_kth_smallest.dml | 14 ++ test_mutual_reachability.dml | 26 ++++ 9 files changed, 238 insertions(+) create mode 100644 scripts/builtin/hdbscan.dml create mode 100644 scripts/builtin/hdbscanApply.dml create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java create mode 100644 src/test/scripts/functions/builtin/hdbscan.R create mode 100644 src/test/scripts/functions/builtin/hdbscan.dml create mode 100644 src/test/scripts/functions/builtin/hdbscanApply.dml create mode 100644 test_build_mst.dml create mode 100644 test_kth_smallest.dml create mode 100644 test_mutual_reachability.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml new file mode 100644 index 00000000000..ca48c7fdfc6 --- /dev/null +++ b/scripts/builtin/hdbscan.dml @@ -0,0 +1,132 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# The hdbscan function is used to perform the hdbscan clustering +# algorithm using knn-based mutual reachability distance and minimum spanning tree. +# +# INPUT: +# ------------------------------------------------------------ +# X The input Matrix to do hdbscan on. +# minPts Minimum number of points for core distance computation. (Defaults to 5) +# minClSize Minimum cluster size (Defaults to minPts) +# ------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------ +# clusterMems Cluster labels for each point +# clusterModel The cluster centroids for prediction +# ------------------------------------------------------------ + +# TODO: m,s , f? +m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -1) + return (Matrix[Double] clusterMems, Matrix[Double] clusterModel) +{ + if(minPts < 2) { + stop("HDBSCAN: minPts should be at least 2") + } + + if(minClSize < 0) { + minClSize = minPts + } + + n = nrow(X) + d = ncol(X) + + if(n < minPts) { + stop("HDBSCAN: Number of data points should be at least minPts") + } + + distances = dist(X) + + coreDistances = matrix(0, rows=n, cols=1) + for(i in 1:n) { + kthDist = computeKthSmallest(distances[i,], minPts) + coreDistances[i] = kthDist + } + + mutualReachDist = computeMutualReachability(distances, coreDistances) + + [mstEdges, mstWeights] = buildMST(mutualReachDist, n) + + # TODO: build cluster hierarchy + # TODO: get stable cluster with stability score + # TODO: build cluster model + + # temp dummy values + clusterMems = matrix(1, rows=n, cols=1) + clusterModel = X +} + + +computeKthSmallest = function(Matrix[Double] array, Integer k) + return (Double res) +{ + sorted = order(target=array, by=1, decreasing=FALSE) + res = as.scalar(sorted[k+1, 1]) +} + + +computeMutualReachability = function(Matrix[Double] distances, Matrix[Double] coreDistances) + return (Matrix[Double] mutualReach) +{ + # mutualReach(i,j) = max(dist(i,j), coreDist(i), coreDist(j)) + # Diagonal is set to zero. + + n = nrow(distances) + + coreDistRow = t(coreDistances) + coreDistCol = coreDistances + + maxCoreRow = (distances > coreDistRow) * distances + (distances <= coreDistRow) * coreDistRow + mutualReach = (maxCoreRow > coreDistCol) * maxCoreRow + (maxCoreRow <= coreDistCol) * coreDistCol + + mutualReach = mutualReach * (1 - diag(matrix(1, rows=n, cols=1))) +} + + +buildMST = function(Matrix[Double] distances, Integer n) + return (Matrix[Double] edges, Matrix[Double] weights) +{ + edges = matrix(0, rows=n-1, cols=2) + weights = matrix(0, rows=n-1, cols=1) + + inMST = matrix(0, rows=n, cols=1) + inMST[1] = 1 + + minDist = distances[1,] + minDist = t(minDist) + + for(i in 1:(n-1)) { + candidates = minDist + inMST * 1e15 + minIdx = as.scalar(rowIndexMin(t(candidates))) + minWeight = as.scalar(minDist[minIdx]) + + # Find which node in MST connects to minIdx + connectIdx = as.scalar(rowIndexMin(distances[minIdx,] + t(1-inMST) * 1e15)) + edges[i,1] = minIdx + edges[i,2] = connectIdx + weights[i] = minWeight + + inMST[minIdx] = 1 + newDists = distances[minIdx,] + minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists) + } +} \ No newline at end of file diff --git a/scripts/builtin/hdbscanApply.dml b/scripts/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.R b/src/test/scripts/functions/builtin/hdbscan.R new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.dml b/src/test/scripts/functions/builtin/hdbscan.dml new file mode 100644 index 00000000000..cc59154f3c3 --- /dev/null +++ b/src/test/scripts/functions/builtin/hdbscan.dml @@ -0,0 +1,20 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- diff --git a/src/test/scripts/functions/builtin/hdbscanApply.dml b/src/test/scripts/functions/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test_build_mst.dml b/test_build_mst.dml new file mode 100644 index 00000000000..a8dd7a44ff9 --- /dev/null +++ b/test_build_mst.dml @@ -0,0 +1,46 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 4 +# / | \ +# / | \ +# / (2) (5) +# | | \ +# | | \ +# (2) 1-(3)--3 +# | | / +# \ | (4) +# \ (1) / +# \ | / +# \|/ +# 2 + +distances = matrix(0, rows=4, cols=4) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 3 +distances[3,1] = 3 + +distances[1,4] = 2 +distances[4,1] = 2 + +distances[2,3] = 4 +distances[3,2] = 4 + +distances[2,4] = 2 +distances[4,2] = 2 + +distances[3,4] = 5 +distances[4,3] = 5 + +[edges, weights] = hdb::buildMST(distances, 4) + +totalWeight = sum(weights) + +test_pass = (nrow(edges) == 3) & (totalWeight == 6) + +if(test_pass) { + print("Passed") +} else { + print("Failes") +} \ No newline at end of file diff --git a/test_kth_smallest.dml b/test_kth_smallest.dml new file mode 100644 index 00000000000..b5065230f26 --- /dev/null +++ b/test_kth_smallest.dml @@ -0,0 +1,14 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +array = matrix("0 1 4 2", rows=4, cols=1) + +res1 = hdb::computeKthSmallest(array, 1) +res2 = hdb::computeKthSmallest(array, 2) + +test_pass = (res1 == 1) & (res2 == 2) + +if(test_pass) { + print("Passed") +} else { + print("Failed") +} \ No newline at end of file diff --git a/test_mutual_reachability.dml b/test_mutual_reachability.dml new file mode 100644 index 00000000000..2e86ed28311 --- /dev/null +++ b/test_mutual_reachability.dml @@ -0,0 +1,26 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +distances = matrix("0 1 5 1 0 4 5 4 0", rows=3, cols=3) +coreDistances = matrix("1 1 4", rows=3, cols=1) + +mutualReach = hdb::computeMutualReachability(distances, coreDistances) + +diagSum = sum(diag(mutualReach)) + +val_AB = as.scalar(mutualReach[1,2]) +val_AC = as.scalar(mutualReach[1,3]) +val_BC = as.scalar(mutualReach[2,3]) + +sym_AB = as.scalar(mutualReach[2,1]) +sym_AC = as.scalar(mutualReach[3,1]) +sym_BC = as.scalar(mutualReach[3,2]) + +test1_pass = (val_AB == 1) & (val_AC == 5) & (val_BC == 4) & + (diagSum == 0) & + (val_AB == sym_AB) & (val_AC == sym_AC) & (val_BC == sym_BC) + +if(test1_pass) { + print("Passed") +} else { + print("Failed") +}