diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..1eead6f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.15...4.0) +project(tripclust LANGUAGES CXX) + +set(PYBIND11_FINDPYTHON ON) +find_package(pybind11 CONFIG REQUIRED) + +find_package (Eigen3 REQUIRED) +include_directories(${EIGEN3_INCLUDE_DIR}) + +pybind11_add_module(tripclust src/spyral_utils/tripclust/pybind.cpp src/spyral_utils/tripclust/src/postprocess.cpp src/spyral_utils/tripclust/src/directedgraph.cpp src/spyral_utils/tripclust/src/orthogonallsq.cpp src/spyral_utils/tripclust/src/pointcloud.cpp src/spyral_utils/tripclust/src/option.cpp src/spyral_utils/tripclust/src/cluster.cpp src/spyral_utils/tripclust/src/dnn.cpp src/spyral_utils/tripclust/src/graph.cpp src/spyral_utils/tripclust/src/triplet.cpp src/spyral_utils/tripclust/src/util.cpp src/spyral_utils/tripclust/src/hclust/fastcluster.cpp src/spyral_utils/tripclust/src/kdtree/kdtree.cpp) +install(TARGETS tripclust DESTINATION .) diff --git a/README.md b/README.md index a5f42ea..4e0846a 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ spyral-utils is a utility library that contains some of the core functionality - Some histogramming and gating/cuting tools that are plotting backend agnostic - Energy loss analysis for gas and solid targets using pycatima - 4-vector analysis through the vector package +- Point triplet clustering algorithm (also called tripclust or Dalitz clustering) See the [documentation](https://attpc.github.io/spyral-utils/) for more details. @@ -69,9 +70,9 @@ For a gas mixture: } ``` -Compound specifications are lists of elements where each element is an array of `[Z, A, S]`. `S` is the -stoichiometry of that particular element in the compound. spyral-utils does not support target layers at -this time (but layered targets can be built from the building blocks provided by spyral-utils). 
In the above examples the +Compound specifications are lists of elements where each element is an array of `[Z, A, S]`. `S` is the +stoichiometry of that particular element in the compound. spyral-utils does not support target layers at +this time (but layered targets can be built from the building blocks provided by spyral-utils). In the above examples the gas target is for 1H2 gas at 300 Torr pressure and the solid target is for 12C1 foil with a thickness of 50 μg/cm2. The gas mixutre example is for P10 gas (10% methane in argon) at 50 Torr. @@ -172,3 +173,4 @@ For testing we use - Gordon McCann - Nathan Turi +- Daniel Bazin diff --git a/docs/api/tripclust/index.md b/docs/api/tripclust/index.md new file mode 100644 index 0000000..fd33f60 --- /dev/null +++ b/docs/api/tripclust/index.md @@ -0,0 +1,3 @@ +# Module Overview + +::: spyral_utils.tripclust diff --git a/pyproject.toml b/pyproject.toml index b904687..ad720e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,11 @@ [project] name = "spyral-utils" -version = "2.0.0" +version = "2.1.0" description = "A collection of useful utilities for the Spyral analysis framework" authors = [ {name = "Gordon McCann", email = "mccann@frib.msu.edu"}, {name = "Nathan Turi", email = "turi@frib.msu.edu"}, + {name = "Daniel Bazin", email = "bazin@frib.msu.edu"}, ] dependencies = [ "numpy>=2.0, <2.2", @@ -13,14 +14,17 @@ dependencies = [ "pycatima>=1.90", "shapely>=2.0.6", "vector>=1.3.0", + "pybind11>=3.0.0", + "eigenpip>=0.1.0", ] requires-python = ">=3.10,<3.14" readme = "README.md" license = {text = "GPLv3"} [build-system] -requires = ["pdm-backend"] -build-backend = "pdm.backend" +requires = ["pdm-backend", "scikit-build-core", "pybind11"] +# build-backend = "pdm.backend" +build-backend = "scikit_build_core.build" [dependency-groups] dev = [ diff --git a/src/spyral_utils/tripclust/CMakeLists.txt b/src/spyral_utils/tripclust/CMakeLists.txt new file mode 100644 index 0000000..41aa21f --- /dev/null +++ 
b/src/spyral_utils/tripclust/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.15...4.0) +project(tripclust LANGUAGES CXX) + +set(PYBIND11_FINDPYTHON ON) +find_package(pybind11 CONFIG REQUIRED) + +find_package (Eigen3 REQUIRED) +include_directories(${EIGEN3_INCLUDE_DIR}) + +pybind11_add_module(tripclust pybind.cpp src/postprocess.cpp src/directedgraph.cpp src/orthogonallsq.cpp src/pointcloud.cpp src/option.cpp src/cluster.cpp src/dnn.cpp src/graph.cpp src/triplet.cpp src/util.cpp src/hclust/fastcluster.cpp src/kdtree/kdtree.cpp) +install(TARGETS tripclust DESTINATION .) diff --git a/src/spyral_utils/tripclust/pybind.cpp b/src/spyral_utils/tripclust/pybind.cpp new file mode 100644 index 0000000..56d8a5c --- /dev/null +++ b/src/spyral_utils/tripclust/pybind.cpp @@ -0,0 +1,193 @@ +#include +#include +#include +#include +#include "src/pointcloud.h" +#include "src/option.h" +#include "src/cluster.h" +#include "src/dnn.h" +#include "src/graph.h" +#include "src/postprocess.h" + +namespace py = pybind11; + +class tripclust { +private: + Opt opt; + PointCloud cloud_xyz; + cluster_group cl_group; + bool postprocess; + int min_depth; + +public: + tripclust(); + void fill_pointcloud(py::array_t); + void perform_clustering(); + py::array_t get_cluster(int); + py::array_t get_labels(); + int get_number_of_clusters(); + // neighbour distance for smoothing + void set_r(const double value) {opt.r = value;} + void set_rdnn(const bool value) {opt.rdnn = value;} // compute r with dnn + // tested neighbours of triplet mid point + void set_k(const size_t value) {opt.k = value;} + // max number of triplets to one mid point + void set_n(const size_t value) {opt.n = value;} + // 1 - cos alpha, where alpha is the angle between the two triplet branches + void set_a(const double value) {opt.a = value;} + // distance scale factor in metric + void set_s(const double value) {opt.s = value;} + void set_sdnn(const bool value) {opt.sdnn = value;} // compute s with dnn + // threshold 
for cdist in clustering + void set_t(const double value) {opt.t = value;} + void set_tauto(const bool value) {opt.tauto = value;} // auto generate t + // maximum gap width + void set_dmax(const double value) {opt.dmax = value;} + void set_dmax_dnn(const bool value) {opt.dmax_dnn = value;} // use dnn for dmax + void set_ordered(const bool value) {opt.ordered = value;} // points are in chronological order + // linkage method for clustering + void set_link(const Linkage value) {opt.link = value;} + // min number of triplets per cluster + void set_m(const size_t value) {opt.m = value;} + // whether or not post processing should be enabled + void set_postprocess(const bool value) {postprocess = value;} + // minimum number of points making a branch in curve in post processing + void set_min_depth(const int value) {min_depth = value;} +}; + +tripclust::tripclust() { + Opt opt; + PointCloud pc; + cluster_group cg; + this->opt = opt; + this->cloud_xyz = pc; + this->cl_group = cg; + this->postprocess = false; + this->min_depth = 25; +} + +// Load cloud_xyz from numpy array +void tripclust::fill_pointcloud(py::array_t arr) { + py::buffer_info buf_info = arr.request(); + double *ptr = static_cast(buf_info.ptr); + size_t ndim = buf_info.ndim; + std::vector shape = buf_info.shape; + + // The numpy array is assumed to be a 2D cloud array from Spyral + for (size_t i = 0; i < shape[0]; i++) { + Point point; + point.x = ptr[i*shape[1]+0]; // x coordinate + point.y = ptr[i*shape[1]+1]; // y coordinate + point.z = ptr[i*shape[1]+2]; // z coordinate + this->cloud_xyz.push_back(point); + } + // this->cloud.setOrdered(true); + // this->cloud_xyz.set2d(false); +} + +void tripclust::perform_clustering() { + if (this->opt.needs_dnn()) { + double dnn = std::sqrt(first_quartile(this->cloud_xyz)); + this->opt.set_dnn(dnn); + if (dnn == 0.0) { + return; // Need to throw some kind of error + } + } + // Step 1) smoothing by position averaging of neighboring points + PointCloud cloud_xyz_smooth; + 
smoothen_cloud(this->cloud_xyz, cloud_xyz_smooth, this->opt.get_r()); + + // Step 2) finding triplets of approximately collinear points + std::vector triplets; + generate_triplets(cloud_xyz_smooth, triplets, this->opt.get_k(), + this->opt.get_n(), this->opt.get_a()); + + // Step 3) single link hierarchical clustering of the triplets + // cluster_group cl_group; + compute_hc(cloud_xyz_smooth, this->cl_group, triplets, this->opt.get_s(), + this->opt.get_t(), this->opt.is_tauto(), this->opt.get_dmax(), + this->opt.is_dmax(), this->opt.get_linkage(), 0); + + // Step 4) pruning by removal of small clusters ... + cleanup_cluster_group(this->cl_group, this->opt.get_m(), 0); + cluster_triplets_to_points(triplets, this->cl_group); + // .. and (optionally) by splitting up clusters at gaps > dmax + if (this->opt.is_dmax()) { + cluster_group cleaned_up_cluster_group; + for (cluster_group::iterator cl = this->cl_group.begin(); cl != this->cl_group.end(); ++cl) { + max_step(cleaned_up_cluster_group, *cl, this->cloud_xyz, this->opt.get_dmax(), + this->opt.get_m() + 2); + } + this->cl_group = cleaned_up_cluster_group; + } + // store cluster labels in points + add_clusters(this->cloud_xyz, this->cl_group, 0); + // Step 5) Optionally perform post processing to find sub clusters + if (this->postprocess) { + int nchanged; + nchanged = process_pointcloud(this->cloud_xyz, this->min_depth, 0); + } +} + +// Get an individual cluster array +py::array_t tripclust::get_cluster(int clu) { + std::vector cluster; + for (size_t i=0; icl_group[clu].size(); ++i) + cluster.push_back(this->cl_group[clu][i]); + return py::array_t(cluster.size(), cluster.data()); +} + +// Get an array containing the cluster labels for each point +py::array_t tripclust::get_labels() { + std::vector labels(this->cloud_xyz.size(), -1); + for (size_t i=0; icloud_xyz.size(); ++i) { + std::set::iterator iterator=this->cloud_xyz[i].cluster_ids.begin(); + if (this->cloud_xyz[i].cluster_ids.size() > 0) labels[i] = *iterator; 
// Choose the first cluster id + } + // + // for (size_t clu=0; clucl_group.size(); ++clu) { + // for (size_t i=0; icl_group[clu].size(); ++i) { + // labels[this->cl_group[clu][i]] = clu; + // } + // } + return py::array_t(labels.size(), labels.data()); +} + +// Get the number of clusters +int tripclust::get_number_of_clusters(){ + int nclu = 0; + std::vector test(this->cloud_xyz.size(), false); + for (size_t i=0; icloud_xyz.size(); ++i) { + for (std::set::iterator j=this->cloud_xyz[i].cluster_ids.begin(); j!=this->cloud_xyz[i].cluster_ids.end(); ++j) + test[*j] = true; + } + for (size_t i=0; icl_group.size(); +} + +PYBIND11_MODULE(tripclust, m, py::mod_gil_not_used()) { + py::class_(m, "tripclust") + .def(py::init()) + .def("fill_pointcloud", &tripclust::fill_pointcloud, "Fills point cloud") + .def("perform_clustering", &tripclust::perform_clustering, "Performs the tripclust clustering") + .def("get_number_of_clusters", &tripclust::get_number_of_clusters, "Get the number of clusters") + .def("get_cluster", &tripclust::get_cluster, "Retrieves individual cluster") + .def("get_labels", &tripclust::get_labels, "Returns an array containing the label for each point") + .def("set_r", &tripclust::set_r, "neighbour distance for smoothing") + .def("set_rdnn", &tripclust::set_rdnn, "compute r with dnn") + .def("set_k", &tripclust::set_k, "tested neighbours of triplet mid point") + .def("set_n", &tripclust::set_n, "max number of triplets to one mid point") + .def("set_a", &tripclust::set_a, "1 - cos alpha, where alpha is the angle between the two triplet branches") + .def("set_s", &tripclust::set_s, "distance scale factor in metric") + .def("set_sdnn", &tripclust::set_sdnn, "compute s with dnn") + .def("set_t", &tripclust::set_t, "threshold for cdist in clustering") + .def("set_tauto", &tripclust::set_tauto, "auto generate t") + .def("set_dmax", &tripclust::set_dmax, "maximum gap width") + .def("set_dmax_dnn", &tripclust::set_dmax_dnn, "use dnn for dmax") + 
.def("set_ordered", &tripclust::set_ordered, "points are in chronological order") + .def("set_link", &tripclust::set_link, "linkage method for clustering") + .def("set_m", &tripclust::set_m, "min number of triplets per cluster") + .def("set_postprocess", &tripclust::set_postprocess, "whether or not post processing should be enabled") + .def("set_min_depth", &tripclust::set_min_depth, "minimum number of points making a branch in curve in post processing"); +} diff --git a/src/spyral_utils/tripclust/src/cluster.cpp b/src/spyral_utils/tripclust/src/cluster.cpp new file mode 100644 index 0000000..16e44e8 --- /dev/null +++ b/src/spyral_utils/tripclust/src/cluster.cpp @@ -0,0 +1,259 @@ +// +// cluster.cpp +// Functions for triplet clustering and for propagating +// the triplet cluster labels to points +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2019-04-02 +// License: see ../LICENSE +// + +#include +#include +#include + +#include "cluster.h" +#include "hclust/fastcluster.h" + +// compute mean of *a* with size *m* +double mean(const double *a, size_t m) { + double sum = 0; + for (size_t i = 0; i < m; ++i) { + sum += a[i]; + } + return sum / m; +} + +// compute standard deviation of *a* with size *m* +double sd(const double *a, size_t m) { + double sum = 0, result; + double mean_val = mean(a, m); + for (size_t k = 0; k < m; ++k) { + double tmp = mean_val - a[k]; + sum += (tmp * tmp); + } + result = (1.0 / (m - 1.0)) * sum; + return std::sqrt(result); +} + +//------------------------------------------------------------------- +// computation of condensed distance matrix. +// The distance matrix is computed from the triplets in *triplets* +// and saved in *result*. *triplet_metric* is used as distance metric. 
+//------------------------------------------------------------------- +void calculate_distance_matrix(const std::vector &triplets, + const PointCloud &cloud, double *result, + ScaleTripletMetric &triplet_metric) { + size_t const triplet_size = triplets.size(); + size_t k = 0; + + for (size_t i = 0; i < triplet_size; ++i) { + for (size_t j = i + 1; j < triplet_size; j++) { + result[k++] = triplet_metric(triplets[i], triplets[j]); + } + } +} + +//------------------------------------------------------------------- +// Computation of the clustering. +// The triplets in *triplets* are clustered by the fastcluster algorithm +// and the result is returned as cluster_group. *t* is the cut distance +// and *triplet_metric* is the distance metric for the triplets. +// *opt_verbose* is the verbosity level for debug outputs. the clustering +// is returned in *result*. +//------------------------------------------------------------------- +void compute_hc(const PointCloud &cloud, cluster_group &result, + const std::vector &triplets, double s, double t, + bool tauto, double dmax, bool is_dmax, Linkage method, + int opt_verbose) { + const size_t triplet_size = triplets.size(); + size_t k, cluster_size; + hclust_fast_methods link; + + if (!triplet_size) { + // if no triplets are generated + return; + } + // choose linkage method + switch (method) { + case SINGLE: + link = HCLUST_METHOD_SINGLE; + break; + case COMPLETE: + link = HCLUST_METHOD_COMPLETE; + break; + case AVERAGE: + link = HCLUST_METHOD_AVERAGE; + break; + } + + double *distance_matrix = new double[(triplet_size * (triplet_size - 1)) / 2]; + double *cdists = new double[triplet_size - 1]; + int *merge = new int[2 * (triplet_size - 1)], *labels = new int[triplet_size]; + ScaleTripletMetric metric(s); + calculate_distance_matrix(triplets, cloud, distance_matrix, metric); + + hclust_fast(triplet_size, distance_matrix, link, merge, cdists); + + // splitting the dendrogram into clusters + if (tauto) { + // automatic 
stopping criterion where cdist is unexpected large + for (k = (triplet_size - 1) / 2; k < (triplet_size - 1); ++k) { + if ((cdists[k - 1] > 0.0 || cdists[k] > 1.0e-8) && + (cdists[k] > cdists[k - 1] + 2 * sd(cdists, k + 1))) { + break; + } + } + if (opt_verbose) { + double automatic_t; + double prev_cdist = (k > 0) ? cdists[k - 1] : 0.0; + if (k < (triplet_size - 1)) { + automatic_t = (prev_cdist + cdists[k]) / 2.0; + } else { + automatic_t = cdists[k - 1]; + } + std::cout << "[Info] optimal cdist threshold: " << automatic_t + << std::endl; + } + } else { + // fixed threshold t + for (k = 0; k < (triplet_size - 1); ++k) { + if (cdists[k] >= t) { + break; + } + } + } + cluster_size = triplet_size - k; + cutree_k(triplet_size, merge, cluster_size, labels); + + // generate clusters + for (size_t i = 0; i < cluster_size; ++i) { + cluster_t new_cluster; + result.push_back(new_cluster); + } + + for (size_t i = 0; i < triplet_size; ++i) { + result[labels[i]].push_back(i); + } + + if (opt_verbose > 1) { + // write debug file + const char *fname = "debug_cdist.csv"; + std::ofstream of(fname); + of << std::fixed; // set float style + if (of.is_open()) { + for (size_t i = 0; i < (triplet_size - 1); ++i) { + of << cdists[i] << std::endl; + } + } else { + std::cerr << "[Error] could not write file '" << fname << "'\n"; + } + of.close(); + } + + // cleanup + delete[] distance_matrix; + delete[] cdists; + delete[] merge; + delete[] labels; +} + +//------------------------------------------------------------------- +// Remove all clusters in *cl_group* which contains less then *m* +// triplets. *cl_group* will be modified. 
+//------------------------------------------------------------------- +void cleanup_cluster_group(cluster_group &cl_group, size_t m, int opt_verbose) { + size_t old_size = cl_group.size(); + cluster_group::iterator it = cl_group.begin(); + while (it != cl_group.end()) { + if (it->size() < m) { + it = cl_group.erase(it); + } else { + ++it; + } + } + if (opt_verbose > 0) { + std::cout << "[Info] in pruning removed clusters: " + << old_size - cl_group.size() << std::endl; + } +} + +//------------------------------------------------------------------- +// Convert the triplet indices ind *cl_group* to point indices. +// *triplets* contains all triplets and *cl_group* will be modified. +//------------------------------------------------------------------- +void cluster_triplets_to_points(const std::vector &triplets, + cluster_group &cl_group) { + for (size_t i = 0; i < cl_group.size(); ++i) { + cluster_t point_indices, ¤t_cluster = cl_group[i]; + for (std::vector::const_iterator triplet_index = + current_cluster.begin(); + triplet_index < current_cluster.end(); ++triplet_index) { + const triplet ¤t_triplet = triplets[*triplet_index]; + point_indices.push_back(current_triplet.point_index_a); + point_indices.push_back(current_triplet.point_index_b); + point_indices.push_back(current_triplet.point_index_c); + } + // sort point-indices and remove duplicates + std::sort(point_indices.begin(), point_indices.end()); + std::vector::iterator new_end = + std::unique(point_indices.begin(), point_indices.end()); + point_indices.resize(std::distance(point_indices.begin(), new_end)); + // replace triplet cluster with point cluster + current_cluster = point_indices; + } +} + +//------------------------------------------------------------------- +// Adds the cluster ids to the points in *cloud* +// *cl_group* contains the clusters with the point indices. For every +// point the corresponding cluster id is saved. 
For gnuplot the points +// which overlap between multiple clusteres are saved in seperate +// clusters in *cl_group* +//------------------------------------------------------------------- +void add_clusters(PointCloud &cloud, cluster_group &cl_group, bool gnuplot) { + for (size_t i = 0; i < cl_group.size(); ++i) { + const std::vector ¤t_cluster = cl_group[i]; + for (std::vector::const_iterator point_index = + current_cluster.begin(); + point_index != current_cluster.end(); ++point_index) { + cloud[*point_index].cluster_ids.insert(i); + } + } + + // add point indices to the corresponding cluster/vertex vector. this is for + // the gnuplot output + if (gnuplot) { + std::vector verticies; + for (size_t i = 0; i < cloud.size(); ++i) { + const Point &p = cloud[i]; + if (p.cluster_ids.size() > 1) { + // if the point is in multiple clusters add it to the corresponding + // vertex or create one if none exists + bool found = false; + for (std::vector::iterator v = verticies.begin(); + v != verticies.end(); ++v) { + if (cloud[v->at(0)].cluster_ids == p.cluster_ids) { + v->push_back(i); + found = true; + } + } + if (!found) { + cluster_t v; + v.push_back(i); + verticies.push_back(v); + } + // remove the point from all other clusters + for (std::set::iterator it = p.cluster_ids.begin(); + it != p.cluster_ids.end(); ++it) { + cluster_t &cluster = cl_group[*it]; + cluster.erase(std::remove(cluster.begin(), cluster.end(), i), + cluster.end()); + } + } + } + // extend clusters with verticies + cl_group.reserve(cl_group.size() + verticies.size()); + cl_group.insert(cl_group.end(), verticies.begin(), verticies.end()); + } +} diff --git a/src/spyral_utils/tripclust/src/cluster.h b/src/spyral_utils/tripclust/src/cluster.h new file mode 100644 index 0000000..00a6b44 --- /dev/null +++ b/src/spyral_utils/tripclust/src/cluster.h @@ -0,0 +1,37 @@ +// +// cluster.h +// Functions for triplet clustering and for propagating +// the triplet cluster labels to points +// +// Author: Jens Wilberg, 
Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef CLUSTER_H +#define CLUSTER_H + +#include +#include + +#include "triplet.h" +#include "util.h" + +typedef std::vector cluster_t; + +typedef std::vector cluster_group; + +// compute hierarchical clustering +void compute_hc(const PointCloud &cloud, cluster_group &result, + const std::vector &triplets, double s, double t, + bool tauto = false, double dmax = 0, bool is_dmax = false, + Linkage method = SINGLE, int opt_verbose = 0); +// remove all small clusters +void cleanup_cluster_group(cluster_group &cg, size_t m, int opt_verbose = 0); +// convert the triplet indices ind *cl_group* to point indices. +void cluster_triplets_to_points(const std::vector &triplets, + cluster_group &cl_group); +// adds the cluster ids to the points in *cloud* +void add_clusters(PointCloud &cloud, cluster_group &cl_group, + bool gnuplot = false); +#endif diff --git a/src/spyral_utils/tripclust/src/directedgraph.cpp b/src/spyral_utils/tripclust/src/directedgraph.cpp new file mode 100644 index 0000000..60005e6 --- /dev/null +++ b/src/spyral_utils/tripclust/src/directedgraph.cpp @@ -0,0 +1,433 @@ +// +// directedgraph.cpp +// Class for directed graph and building directed MST. +// +// Author: Christoph Dalitz +// Date: 2022-11-07 +// License: see ../LICENSE +// +/** @file */ + +#include "directedgraph.h" +#include "util.h" + +#include /* sqrt */ +#include +#include /* assert */ + +/** + * @brief Return the euklidian distance of two Kdtree::CoordPoints + * + * @param a + * @param b + * @return double + */ +double euklidian_distance(Kdtree::CoordPoint a, Kdtree::CoordPoint b) { + return sqrt(((a[0]-b[0])*(a[0]-b[0]))+((a[1]-b[1])*(a[1]-b[1]))+((a[2]-b[2])*(a[2]-b[2]))); +} + +/** + * @brief Construct a new Graph:: Graph object. This includes building an MST. 
+ * @param cloud + * @param indices + */ +Graph::Graph(PointCloud & cloud, std::vector indices) { + // check input + assert(cloud.size() > 0); + assert(indices.size() > 0); + assert(std::set(indices.begin(), indices.end()).size() == indices.size()); // indices are unique + assert((indices.back() < cloud.size())); // checks range of indices + + this->indices = indices; + + // insert Nodes without Edges. Nodes have x,y,z Coordinates and the corresponding index + for(unsigned int i = 0; i < indices.size(); i++) { + graph.insert(std::pair(indices[i], Node(indices[i], cloud[indices[i]].x, cloud[indices[i]].y, cloud[indices[i]].z))); + } + + // Add first subtree: the first Point by time and the last Point by Index + this->subtree_roots.push_back(&graph.at(indices[indices.size()-1])); + + // construct MST by adding Edges + constructMST(); +} + +/** + * @brief Constructs a KDTree. Requires the graph and indices attribute to be set + * + * @return Kdtree::KdTree + */ +Kdtree::KdTree Graph::constructKDTree() { + // build kdtree + Kdtree::KdNodeVector nodes; + size_t index; + for (std::vector::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + index = *iterator; + Kdtree::CoordPoint p={graph.at(index).x, graph.at(index).y, graph.at(index).z}; + nodes.push_back(Kdtree::KdNode(p, NULL, index)); + } + Kdtree::KdTree kdtree(&nodes); + return kdtree; +} + +/** + * @brief Constructs MST with Chu-Lui/Edmond's algorithm. Simplified, because we assume no cycles are possible due to strict ordering. 
+ * + */ +void Graph::constructMST() { + // build kdtree + Kdtree::KdTree kdtree = constructKDTree(); + + double cost; + size_t nn_idx; + size_t last = indices.back(); + size_t index; + // for vertex in graph do + for (std::vector::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + index=*iterator; + // no outgoing edges to be added at last Node + if(index!=last) { + Kdtree::CoordPoint p = {graph.at(index).x, graph.at(index).y, graph.at(index).z}; + Kdtree::KdNodeVector neighbors; + Predicate predicate = Predicate(index); + std::vector distances; + kdtree.k_nearest_neighbors(p, 1, &neighbors, &distances, &predicate); + // nearest neighbor + if(neighbors.size()== 0) { + std::cout<<"neighbors is empty!!!"<::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + idx = *iterator; + removed += graph.at(idx).removeBranchEdges(depth, &removed_branch_edges,&subtree_roots); + } + if(removed>0) { + return true; + } + return false; +} + +/** + * @brief Calls removeLongEdges for every Node in the Graph. + * + */ +void Graph::removeLongEdges() { + size_t removed = 0; + size_t idx; + for (std::vector::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + idx=*iterator; + removed += graph.at(idx).removeLongEdges(&removed_long_edges,&subtree_roots); + } +} + +/** + * @brief Does a breath-first search starting at each of the Segment Heads to assign new Curve IDs. 
+ * + * @return std::vector< std::vector > + */ +std::vector< std::vector > Graph::assignNewCurveIds() { + int current_id = 0; + std::vector< std::vector > result(subtree_roots.size()); + std::vector open_list; + std::vector closed_list; + std::vector temp; + size_t v; + Node * node_ptr; + size_t i; + // for segment Head + for (std::vector::iterator iterator = subtree_roots.begin(); iterator != subtree_roots.end(); ++iterator) { + node_ptr = *iterator; + open_list.clear(); + closed_list.clear(); + closed_list.push_back(node_ptr->index); + open_list.push_back(node_ptr->index); + result[current_id].push_back(node_ptr->index); + while(!open_list.empty()) { + v = open_list.front(); + open_list.erase(open_list.begin()); + temp.clear(); + graph.at(v).get_next(&temp, true); + for (std::vector::iterator iterator2 = temp.begin(); iterator2 != temp.end(); ++iterator2) { + i = *iterator2; + if(std::find(closed_list.begin(), closed_list.end(), i) == closed_list.end()) { // if not in closed list + closed_list.push_back(i); + result[current_id].push_back(i); + open_list.push_back(i); + } + } + } + current_id++; + } + return result; +} + +/** + * @brief Generates two Gnuplot files in the calling directory: One with the filename [filename_prefix]_edgeRemoval.gnuplot and another with the filename [filename_prefix]_segmentation.gnuplot + * [filename_prefix]_edgeRemoval.gnuplot: + * * Demonstrates the Edges that are removed either because they are long or because they are head of a branch. + * * The color of the points corresponds to the index to show the order of the points. The new roots of the resulting subtrees are also shown. 
+ * @param f + * @param filename_prefix + */ +void Graph::generateGnuplot(bool f, std::string filename_prefix) { + + // option for graph output: + // 1: Remove Edges Plot + // 2: Cluster-Ids Plot + + + if(f) { + // demo edge Removal in MST + std::ofstream outfile1 (filename_prefix+"_edgeRemoval.gnuplot"); + std::vector idx; + size_t index; + size_t edge_to; + Edge e; + Edge edge; + Node * node_ptr; + if ( outfile1 ) { + outfile1<<"set palette rgb 34,35,36;\n"; + //outfile<<"splot '-' using 1:2:3 with points palette; pause -1;\n"; + outfile1<<"splot '-' using 1:2:3 with points palette pointtype 7 ps 1, '-' with lines lc 'black' title 'Kept', '-' with lines lc 'green' title 'Removed (Too long)', '-' with lines lc 'magenta' title 'Removed (Is Branch)', '-' with points lc 'web-blue' pointtype 1 ps 2 title 'Head of Segment'; pause -1;\n"; //, '-' using 1:2:3 with points lc 'blue' ps 1, '-' with lines lc 'blue' title 'Removal Candidate'; pause -1;\n"; + for (std::vector::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + index = *iterator; + outfile1<::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + index = *iterator; + graph[index].get_next(&idx); + for (std::vector::iterator iterator2 = idx.begin(); iterator2 != idx.end(); ++iterator2) { + edge_to = *iterator2; + outfile1<::iterator iterator = this->removed_long_edges.begin(); iterator != this->removed_long_edges.end(); ++iterator) { + edge = *iterator; + outfile1<x<<" "<y<<" "<z<<"\n"; + outfile1<x<<" "<y<<" "<z<<"\n"; + outfile1<<"\n\n"; + } + outfile1<<"e\n"; + for (std::vector::iterator iterator = this->removed_branch_edges.begin(); iterator != this->removed_branch_edges.end(); ++iterator) { + edge = *iterator; + outfile1<x<<" "<y<<" "<z<<"\n"; + outfile1<x<<" "<y<<" "<z<<"\n"; + outfile1<<"\n\n"; + } + outfile1<<"e\n"; + for (std::vector::iterator iterator = this->subtree_roots.begin(); iterator != this->subtree_roots.end(); ++iterator) { + node_ptr = 
*iterator; + outfile1<x<<" "<y<<" "<z<<"\n"; + } + outfile1.close(); + } + // demo segmentation + std::ofstream outfile(filename_prefix+"_segmentation.gnuplot"); + std::vector< std::vector > result; + result = assignNewCurveIds(); + if ( outfile ) { + outfile<<"splot '-' with points lc 'black' pt 1, "; + for(unsigned int cluster = 0; cluster::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + index = *iterator; + outfile<::iterator iterator = result[cluster].begin(); iterator != result[cluster].end(); ++iterator) { + index = *iterator; + outfile<::iterator iterator = subtree_roots.begin(); iterator != subtree_roots.end(); ++iterator) { + node_ptr = *iterator; + outfile<x<<" "<y<<" "<z<<"\n"; + } + outfile<<"e\n"; + outfile.close(); + } + } +} + +/** + * @brief Does a breadth-first search until the max_depth (4). Returns the count of the Edges and the sum of the costs of earch encountered edge. + * pair : N edges, Sum of the Cost of the Edges + * @return std::pair + */ +std::pair Node::get_neighbourhood() { + int max_depth = 4; // 4-er Nachbarschaft + double sum = 0; + double N = 0; + // open_list is queue + std::vector open_list; + std::vector closed_list; + Node * v = this; + int level=0; + Edge e; + + // null Idee von: https://stackoverflow.com/questions/31247634/how-to-keep-track-of-depth-in-breadth-first-search + + open_list.push_back(this); + open_list.push_back(NULL); + while( (! 
open_list.empty() ) && (level < max_depth) ) { + v = open_list.front(); // queueify + open_list.erase(open_list.begin()); + if(v == NULL){ + level++; + //if(open_list.back() == NULL) break; // changed + open_list.push_back(NULL); + } else { + closed_list.push_back(v); + for (std::vector::iterator iterator = v->_out.begin(); iterator != v->_out.end(); ++iterator) { + e = *iterator; + if(std::find(closed_list.begin(), closed_list.end(), e.end) == closed_list.end()) { + sum += e.cost; + N ++; + open_list.push_back(e.end); + } + } + for (std::vector::iterator iterator = v->_in.begin(); iterator != v->_in.end(); ++iterator) { + e = *iterator; + if(std::find(closed_list.begin(), closed_list.end(), e.begin) == closed_list.end()) { + sum += e.cost; + N ++; + open_list.push_back(e.begin); + } + } + } + + } + return std::pair(N,sum); +} + +/** + * @brief Removes the longest Edge(s) leading into a branch of a depth of at least min_depth. Adds the next Node as a new Segment Head and the Edge to a debug vector. + * Returns count of removed Edges. + * @param min_depth + * @param debug_removed_edges + * @param segment_heads + * @return size_t + */ +size_t Node::removeBranchEdges(size_t min_depth, std::vector * debug_removed_edges, std::vector * segment_heads) { + + size_t counter=0; + std::vector removal_candidates; + Edge i; + Edge i2; + + if(this->isBranchRoot()) { // check if this node is Root of a branch (has > 1 Incoming Edge) + for(std::vector::iterator it = _in.begin(); it != _in.end();it++) { // for every edge: + if(it->begin->depthSearch(min_depth)) { // check depth of branch + // if depth > min_depth + counter++; + removal_candidates.push_back(*it); + } + } + if(counter >= 2) { // is this Node Root at least 2 long Branches? 
+ std::vector::iterator iteratorMinElement = std::min_element(removal_candidates.begin(), removal_candidates.end()); + removal_candidates.erase(iteratorMinElement); + for (std::vector::iterator iterator = removal_candidates.begin(); iterator != removal_candidates.end(); ++iterator) { + i = *iterator; + debug_removed_edges->push_back(i); + } + std::vector temp_in = _in; + _in.clear(); + for (std::vector::iterator iterator = temp_in.begin(); iterator != temp_in.end(); ++iterator) { + i2 = *iterator; + if (! (std::find(removal_candidates.begin(), removal_candidates.end(), i2) != removal_candidates.end())) { + _in.push_back(i2); + } else { + i.begin->remove_out_edge(this); + segment_heads->push_back(i2.begin); + } + } + temp_in.clear(); + removal_candidates.clear(); + return counter-1; + } + } + return 0; +} + + +/** + * @brief Calls get_neighbourhood and then checks for every incoming edge e with weight w: + * * get the mean weight length of the neighbourhood edges (without e): n_mean + * * if w > (n_mean*4.5): + * * * edge should be removed due to being long + * * * add the edge to the debug vector + * * * add the begin of the edge to the segment head vector + * * * remove edge e + * @param debug_removed_edges: is potentially added to + * @param segment_heads: is potentially added to + * @return size_t: how many incoming edges to this Node were removed + */ +size_t Node::removeLongEdges(std::vector * debug_removed_edges, std::vector * segment_heads) { + std::pair n_sum = get_neighbourhood(); + double cost_sum = n_sum.second; + int n_edges = n_sum.first; + size_t counter = 0; + double n_mean = 0; + for(std::vector::iterator it = _in.begin(); it != _in.end();) { // for every edge, do + n_mean = (cost_sum-it->cost)/(n_edges-1); + if(it->cost > (n_mean*4.5)) { // if cost of edge > "Vielfaches" (4.5) of mean Neighbourhood Costs. 
+ debug_removed_edges->push_back(*it); // add edge to removed_edges in Graph object, for gnuplot debug file + segment_heads->push_back(it->begin); + it->begin->remove_out_edge(this); + it = remove_in_edge(it); // remove the edge. + counter++; + } else { + it++; + } + } + return counter; +} + +/** + * @brief gets the indexes of the connected Nodes, either just outgoing edges or also incoming edges if full=true + * + * @return result + * @param full: true if you want indexes of both incoming and outgoing edges + */ +void Node::get_next(std::vector * result, bool full) { + (*result).clear(); + Edge edge; + for (std::vector::iterator iterator = _out.begin(); iterator != _out.end(); ++iterator) { + edge = *iterator; + (*result).push_back(edge.end->index); + } + if(full) { + for (std::vector::iterator iterator = _in.begin(); iterator != _in.end(); ++iterator) { + edge = *iterator; + (*result).push_back(edge.begin->index); + } + } + +} diff --git a/src/spyral_utils/tripclust/src/directedgraph.h b/src/spyral_utils/tripclust/src/directedgraph.h new file mode 100644 index 0000000..2ef6569 --- /dev/null +++ b/src/spyral_utils/tripclust/src/directedgraph.h @@ -0,0 +1,209 @@ +// +// directedgraph.h +// Class for directed graph and building directed MST. +// +// Author: Christoph Dalitz +// Date: 2022-11-07 +// License: see ../LICENSE +// +/** @file */ + +#ifndef DIRECTEDGRAPH_H +#define DIRECTEDGRAPH_H + + +#include +#include +#include +#include +#include "kdtree/kdtree.hpp" +#include "pointcloud.h" + +class Node; + +/** + * @brief A weighted, directed Edge. 
+ * + */ +struct Edge { + Node * begin; + Node * end; + double cost; + Edge() { + begin = NULL; + end = NULL; + cost=0.0; + } + Edge(Node * b, Node * e, double c) { + begin = b; + end = e; + cost = c; + } + bool operator<(const Edge& a) const { + return cost < a.cost; + } + bool operator==(const Edge& a) const { + return (cost==a.cost) && (begin == a.begin) && (end == a.end); + } +}; + + +/** + * @brief A Node containing a vector of incoming and outgoing Edges, the Coordinates and the index. + * + */ +class Node { + private: + std::vector _in; + std::vector _out; + + /** + * @brief Removes _in edge by Iterator. + * + * @param it + * @return * std::vector::iterator + */ + std::vector::iterator remove_in_edge(std::vector::iterator it) { + return _in.erase(it); + } + void add_in(Node * new_node, double cost) { + _in.push_back(Edge(new_node, this, cost)); + } + bool depthSearch(size_t depth) { + Edge e; + if(depth > 0) { + for (std::vector::iterator iterator = _in.begin(); iterator != _in.end(); ++iterator) { + e = *iterator; + if(e.begin->depthSearch(--depth)) { + return true; + } + } + return false; + } else { + return true; + } + } + bool isBranchRoot() { + return _in.size() > 1; + } + + // has to return: sum and n because the edge we look at itself should not be included in mean neighbourhood + // return value is: N and sum + std::pair get_neighbourhood(); + + public: + size_t index; + double x; // X-Coordinate + double y; // Y-Coordinate + double z; // Z-Coordinate + + Node() { + this->index = 0; + this->x = 0.0; + this->y = 0.0; + this->z = 0.0; + } + Node(size_t index, double x, double y, double z) { + this->index = index; + this->x = x; + this->y = y; + this->z = z; + } + size_t getnEdge() { + return _in.size(); + } + void add_edge(Node * new_node, double cost) { + _out.push_back(Edge(this, new_node, cost)); + new_node->add_in(this, cost); + } + + /** + * @brief Removes an out edge by its end. 
+ * + * @param Node * n + */ + void remove_out_edge(Node * n) { + bool found = false; + for(std::vector::iterator it = _out.begin(); (it != _out.end()) && (! found);it++) { + if(it->end == n) { + found=true; + _out.erase(it); + } + } + } + + size_t removeBranchEdges(size_t min_depth, std::vector * debug_removed_edges, std::vector * segment_heads); + + size_t removeLongEdges(std::vector * debug_removed_edges, std::vector * segment_heads); + + // get Index of next Nodes (write into result) + void get_next(std::vector * result, bool all = false); +}; + + +/** + * @brief Graph that organizes in a Minimum-Spanning-Tree. The structure is contained in a std::map of Indexes and Nodes. + * + * + */ +class Graph { + private: + std::map graph; // reason for map: KDTree only returns index as nearest neighbour + std::vector indices; // keep track of index of Node in original, complete pointcloud + std::vector subtree_roots; // keep track of roots of subtrees after removing edges + + + std::vector removed_long_edges; // debugging: All removed long Edges + std::vector removed_branch_edges; // debugging: All removed branch Edges + + void constructMST(); // Edmonds Algorithm, simplified + Kdtree::KdTree constructKDTree(); // Helper Function of constructMST() + + public: + // take Pointcloud and indices and construct a MST + /** + * @brief Construct a new MST Graph object. 
+ * + * @param cloud + * @param indices + */ + Graph(PointCloud & cloud, std::vector indices); + // f==true : write to file with name filename + // f==false : write to stdout + void generateGnuplot(bool f=true, std::string filename="debug-file.gnuplot"); + // remove edges with depth > depth + bool removeBranches(size_t depth); + // remove edges with length greater than "Vielfaches" von Nachbarschaft + void removeLongEdges(); + + std::vector< std::vector > assignNewCurveIds(); + + // update begins of subtrees + void addSegmentHead(Node * new_seg) { + subtree_roots.push_back(new_seg); + } + + size_t getnEdge() { + size_t result = 0; + size_t id; + for (std::vector::iterator iterator = indices.begin(); iterator != indices.end(); ++iterator) { + id = *iterator; + result += graph.at(id).getnEdge(); + } + return result; + } +}; + + +// only search for nodes on the right hand side of search point +struct Predicate : public Kdtree::KdNodePredicate { + int a; + Predicate(size_t index) { + a = index; + } + bool operator()(const Kdtree::KdNode& kn) const { + return a < kn.index; + } +}; + +#endif diff --git a/src/spyral_utils/tripclust/src/dnn.cpp b/src/spyral_utils/tripclust/src/dnn.cpp new file mode 100644 index 0000000..3ecfcc0 --- /dev/null +++ b/src/spyral_utils/tripclust/src/dnn.cpp @@ -0,0 +1,66 @@ +// +// dnn.cpp +// Functions for computing the characteristic length dnn +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#include +#include +#include + +#include "dnn.h" +#include "kdtree/kdtree.hpp" + +//------------------------------------------------------------------- +// Compute mean squared distances. +// the distances is computed for every point in *cloud* to its *k* +// nearest neighbours. The distances are returned in *msd*. 
+//------------------------------------------------------------------- +void compute_mean_square_distance(const PointCloud &cloud, + std::vector &msd, int k) { + // compute mean square distances for every point to its k nearest neighbours + Kdtree::KdNodeVector nodes, result; + double sum; + + // build kdtree + for (size_t i = 0; i < cloud.size(); ++i) { + nodes.push_back(cloud[i].as_vector()); + } + Kdtree::KdTree kdtree(&nodes); + + k++; // k must be one higher because the first point found by the kdtree is + // the point itself + + for (size_t i = 0; i < cloud.size(); ++i) { + // compute + std::vector squared_distances; + kdtree.k_nearest_neighbors(cloud[i].as_vector(), k, &result, + &squared_distances); + + squared_distances.erase(squared_distances.begin()); // The first value + // must be deleted + // because it is + // the distance + // with the point + // itself + + sum = std::accumulate(squared_distances.begin(), squared_distances.end(), + 0.0); + msd.push_back(sum / squared_distances.size()); + } +} + +//------------------------------------------------------------------- +// Compute first quartile of the mean squared distance of all points +// in *cloud* +//------------------------------------------------------------------- +double first_quartile(const PointCloud &cloud) { + std::vector msd; + compute_mean_square_distance(cloud, msd, 1); + const double q1 = msd.size() / 4; + std::nth_element(msd.begin(), msd.begin() + q1, msd.end()); + return msd[q1]; +} diff --git a/src/spyral_utils/tripclust/src/dnn.h b/src/spyral_utils/tripclust/src/dnn.h new file mode 100644 index 0000000..1ee02b6 --- /dev/null +++ b/src/spyral_utils/tripclust/src/dnn.h @@ -0,0 +1,17 @@ +// +// dnn.h +// Functions for computing the characteristic length dnn +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef DNN_H +#define DNN_H +#include "pointcloud.h" + +// compute first quartile of the mean squared distance 
from the points +double first_quartile(const PointCloud &cloud); + +#endif diff --git a/src/spyral_utils/tripclust/src/graph.cpp b/src/spyral_utils/tripclust/src/graph.cpp new file mode 100644 index 0000000..ac5fd4c --- /dev/null +++ b/src/spyral_utils/tripclust/src/graph.cpp @@ -0,0 +1,147 @@ +// +// graph.cpp +// Classes and functions for computing the MST and for +// splitting up clusters at gaps +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#include +#include +#include + +#include "graph.h" + +struct Edge { + size_t src, dest; + double weight; + bool operator<(const Edge &e2) const { return this->weight < e2.weight; }; +}; + +// Create edges with weights between all point indices in *cluster*. the +// weights are the distances of the points in *cloud*. The edges are returned +// in *edges*. +void create_edges(std::vector &edges, const PointCloud &cloud, + const std::vector &cluster) { + for (size_t vertex1 = 0; vertex1 < cluster.size(); ++vertex1) { + for (size_t vertex2 = vertex1 + 1; vertex2 < cluster.size(); ++vertex2) { + size_t point_index1 = cluster[vertex1], point_index2 = cluster[vertex2]; + const Point &p = cloud[point_index1]; + const Point &q = cloud[point_index2]; + + // compute squared distance + double distance = (q - p).squared_norm(); + + Edge e = {vertex1, vertex2, distance}; + edges.push_back(e); + } + } + std::sort(edges.begin(), edges.end()); +} + +// Create a adjacent list for every vertex from the edges in *edges*. +// The adjacent list is returned in *adj* which be resized to the number of +// vertices a priori. +void create_adj(std::vector > &adj, + const std::vector edges) { + for (std::vector::const_iterator edge = edges.begin(); + edge != edges.end(); ++edge) { + adj[edge->src].push_back(edge->dest); + adj[edge->dest].push_back(edge->src); + } +} + +// Remove all edges of *edges* with weight smaller *dmax*. *edges* will be +// modified. 
+void remove_edge(std::vector &edges, double dmax) { + std::vector::iterator edge = edges.begin(); + double dmax2 = dmax * dmax; + while (edge != edges.end()) { + if (edge->weight > dmax2) { + edge = edges.erase(edge); + } else { + ++edge; + } + } +} + +// Remove all edges from *edges* which don't belong to the mst. *vcount* is the +// number of verteicies. *edges* will be modified. +void mst(const std::vector &edges, std::vector &mst_edges, + size_t vcount) { + std::vector groups(vcount); + for (size_t i = 0; i < groups.size(); i++) { + groups[i] = i; + } + + for (std::vector::const_iterator e = edges.begin(); e != edges.end(); + ++e) { + size_t group_a = groups[e->src]; + size_t group_b = groups[e->dest]; + + // look if there is no circle + if (group_a != group_b) { + // merge groups + for (std::vector::iterator it = groups.begin(); + it != groups.end(); ++it) { + if (*it == group_b) *it = group_a; + } + mst_edges.push_back(*e); + } + } +} + +// create a cluster of connected components which is returned in *new_cluster*. +// *vertex* is the index of the start vertex. *visited* is a list of the visted +// states from every vertecy. *cluster* is used to get the original point index +// of a vertex. *adj* are the adjacent lists of all vertices. +void dfs_util(std::vector &new_cluster, const size_t vertex, + std::vector &visited, const std::vector &cluster, + const std::vector > &adj) { + std::stack stack; + stack.push(vertex); + while (!stack.empty()) { + size_t v = stack.top(); + stack.pop(); + visited[v] = true; + new_cluster.push_back(cluster[v]); + + for (std::vector::const_iterator it = adj[v].begin(); + it != adj[v].end(); ++it) { + if (!visited[*it]) stack.push(*it); + } + } +} + +//------------------------------------------------------------------- +// Split *cluster* in multiple new clusters and return the result in +// *new_clusters". The mst of the cluster is created and all edges are +// removed with a wheigth > *dmax*. 
The connected components are computed +// and returned as new clusters if their size is >= *min_size*. +//------------------------------------------------------------------- +void max_step(std::vector > &new_clusters, + const std::vector &cluster, const PointCloud &cloud, + double dmax, size_t min_size) { + size_t vcount = cluster.size(); + size_t n_removed; + std::vector > adj(vcount); + std::vector edges, mst_edges; + std::vector visited(vcount); + create_edges(edges, cloud, cluster); + mst(edges, mst_edges, vcount); + n_removed = mst_edges.size(); + remove_edge(mst_edges, dmax); + n_removed = n_removed - mst_edges.size(); + create_adj(adj, mst_edges); + + for (size_t v = 0; v < vcount; ++v) { + if (!visited[v]) { + std::vector new_cluster; + dfs_util(new_cluster, v, visited, cluster, adj); + if ((new_cluster.size() >= min_size) || (n_removed == 0)) + new_clusters.push_back(new_cluster); + } + } +} diff --git a/src/spyral_utils/tripclust/src/graph.h b/src/spyral_utils/tripclust/src/graph.h new file mode 100644 index 0000000..a516ee5 --- /dev/null +++ b/src/spyral_utils/tripclust/src/graph.h @@ -0,0 +1,26 @@ +// +// graph.h +// Classes and functions for computing the MST and for +// splitting up clusters at gaps +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef MSD_H +#define MSD_H +#include +#include + +#include "pointcloud.h" + +// Split *cluster* in multiple new clusters and return the result in +// *new_clusters". The mst of the cluster is created and all edges are +// removed with a wheigth > *dmax*. The connected comonents are computed +// and returned as new clusters if their size is >= *min_size*. 
+void max_step(std::vector > &new_clusters, + const std::vector &cluster, const PointCloud &cloud, + double dmax, size_t min_size); + +#endif diff --git a/src/spyral_utils/tripclust/src/hclust/LICENSE b/src/spyral_utils/tripclust/src/hclust/LICENSE new file mode 100644 index 0000000..ab8b4db --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/LICENSE @@ -0,0 +1,13 @@ +Copyright: + * fastcluster_dm.cpp & fastcluster_R_dm.cpp: + © 2011 Daniel Müllner + * fastcluster.(h|cpp) & demo.cpp & plotresult.r: + © 2018 Christoph Dalitz +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/spyral_utils/tripclust/src/hclust/README b/src/spyral_utils/tripclust/src/hclust/README new file mode 100644 index 0000000..05226fe --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/README @@ -0,0 +1,79 @@ +C++ interface to fast hierarchical clustering algorithms +======================================================== + +This is simplified C++ interface to fast implementations of hierarchical +clustering by Daniel Müllner. The original library with interfaces to R +and Python is described in: + +Daniel Müllner: "fastcluster: Fast Hierarchical, Agglomerative Clustering +Routines for R and Python." Journal of Statistical Software 53 (2013), +no. 9, pp. 1–18, http://www.jstatsoft.org/v53/i09/ + + +Usage of the library +-------------------- + +For using the library, the following source files are needed: + +fastcluster_dm.cpp, fastcluster_R_dm.cpp + original code by Daniel Müllner + these are included by fastcluster.cpp via #include, and therefore + need not be compiled to object code + +fastcluster.[h|cpp] + simplified C++ interface + fastcluster.cpp is the only file that must be compiled + +The library provides the clustering function *hclust_fast* for +creating the dendrogram information in an encoding as used by the +R function *hclust*. For a description of the parameters, see fastcluster.h. +Its parameter *method* can be one of + +HCLUST_METHOD_SINGLE + single link with the minimum spanning tree algorithm (Rohlf, 1973) + +HHCLUST_METHOD_COMPLETE + complete link with the nearest-neighbor-chain algorithm (Murtagh, 1984) + +HCLUST_METHOD_AVERAGE + complete link with the nearest-neighbor-chain algorithm (Murtagh, 1984) + +HCLUST_METHOD_MEDIAN + median link with the generic algorithm (Müllner, 2011) + +For splitting the dendrogram into clusters, the two functions *cutree_k* +and *cutree_cdist* are provided. + +Note that output parameters must be allocated beforehand, e.g. 
+ int* merge = new int[2*(npoints-1)]; +For a complete usage example, see lines 135-142 of demo.cpp. + + +Demonstration program +--------------------- + +A simple demo is implemented in demo.cpp, which can be compiled and run with + + make + ./hclust-demo -m complete lines.csv + +It creates two clusters of line segments such that the segment angle between +line segments of different clusters have a maximum (cosine) dissimilarity. +For visualizing the result, plotresult.r can be used as follows +(requires R to be installed): + + ./hclust-demo -m complete lines.csv | Rscript plotresult.r + + +Authors & Copyright +------------------- + +Daniel Müllner, 2011, +Christoph Dalitz, 2018, + + +License +------- + +This code is provided under a BSD-style license. +See the file LICENSE for details. diff --git a/src/spyral_utils/tripclust/src/hclust/fastcluster.cpp b/src/spyral_utils/tripclust/src/hclust/fastcluster.cpp new file mode 100644 index 0000000..ab2506c --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/fastcluster.cpp @@ -0,0 +1,169 @@ +// +// C++ standalone verion of fastcluster by Daniel Müllner +// +// Copyright: Christoph Dalitz, 2018 +// Daniel Müllner, 2011 +// License: BSD style license +// (see the file LICENSE for details) +// + + +#include +#include + +#include "fastcluster.h" + +// Code by Daniel Müllner +// workaround to make it usable as a standalone version (without R) +bool fc_isnan(double x) { return false; } +#include "fastcluster_dm.cpp" +#include "fastcluster_R_dm.cpp" + +// +// Assigns cluster labels (0, ..., nclust-1) to the n points such +// that the cluster result is split into nclust clusters. 
+// +// Input arguments: +// n = number of observables +// merge = clustering result in R format +// nclust = number of clusters +// Output arguments: +// labels = allocated integer array of size n for result +// +void cutree_k(int n, const int* merge, int nclust, int* labels) { + + int k,m1,m2,j,l; + + if (nclust > n || nclust < 2) { + for (j=0; j last_merge(n, 0); + for (k=1; k<=(n-nclust); k++) { + // (m1,m2) = merge[k,] + m1 = merge[k-1]; + m2 = merge[n-1+k-1]; + if (m1 < 0 && m2 < 0) { // both single observables + last_merge[-m1-1] = last_merge[-m2-1] = k; + } + else if (m1 < 0 || m2 < 0) { // one is a cluster + if(m1 < 0) { j = -m1; m1 = m2; } else j = -m2; + // merging single observable and cluster + for(l = 0; l < n; l++) + if (last_merge[l] == m1) + last_merge[l] = k; + last_merge[j-1] = k; + } + else { // both cluster + for(l=0; l < n; l++) { + if( last_merge[l] == m1 || last_merge[l] == m2 ) + last_merge[l] = k; + } + } + } + + // assign cluster labels + int label = 0; + std::vector z(n,-1); + for (j=0; j= cdist +// +// Input arguments: +// n = number of observables +// merge = clustering result in R format +// height = cluster distance at each merge step +// cdist = cutoff cluster distance +// Output arguments: +// labels = allocated integer array of size n for result +// +void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels) { + + int k; + + for (k=0; k<(n-1); k++) { + if (height[k] >= cdist) { + break; + } + } + cutree_k(n, merge, n-k, labels); +} + + +// +// Hierarchical clustering with one of Daniel Muellner's fast algorithms +// +// Input arguments: +// n = number of observables +// distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing +// the upper triangle (without diagonal elements) of the distance +// matrix, e.g. 
for n=4: +// d00 d01 d02 d03 +// d10 d11 d12 d13 -> d02 d02 d03 d12 d13 d23 +// d20 d21 d22 d23 +// d30 d31 d32 d33 +// method = cluster metric (see enum method_code) +// Output arguments: +// merge = allocated (n-1)x2 matrix (2*(n-1) array) for storing result. +// Result follows R hclust convention: +// - observabe indices start with one +// - merge[i][] contains the merged nodes in step i +// - merge[i][j] is negative when the node is an atom +// height = allocated (n-1) array with distances at each merge step +// Return code: +// 0 = ok +// 1 = invalid method +// +int hclust_fast(int n, double* distmat, int method, int* merge, double* height) { + + // call appropriate culstering function + cluster_result Z2(n-1); + if (method == HCLUST_METHOD_SINGLE) { + // single link + MST_linkage_core(n, distmat, Z2); + } + else if (method == HCLUST_METHOD_COMPLETE) { + // complete link + NN_chain_core(n, distmat, NULL, Z2); + } + else if (method == HCLUST_METHOD_AVERAGE) { + // best average distance + double* members = new double[n]; + for (int i=0; i(n, distmat, members, Z2); + delete[] members; + } + else if (method == HCLUST_METHOD_MEDIAN) { + // best median distance (beware: O(n^3)) + generic_linkage(n, distmat, NULL, Z2); + } + else { + return 1; + } + + int* order = new int[n]; + if (method == HCLUST_METHOD_MEDIAN) { + generate_R_dendrogram(merge, height, order, Z2, n); + } else { + generate_R_dendrogram(merge, height, order, Z2, n); + } + + delete[] order; // only needed for visualization + + return 0; +} diff --git a/src/spyral_utils/tripclust/src/hclust/fastcluster.h b/src/spyral_utils/tripclust/src/hclust/fastcluster.h new file mode 100644 index 0000000..0e13457 --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/fastcluster.h @@ -0,0 +1,73 @@ +// +// C++ standalone verion of fastcluster by Daniel Muellner +// +// Copyright: Daniel Muellner, 2011 +// Christoph Dalitz, 2018 +// License: BSD style license +// (see the file LICENSE for details) +// + +#ifndef 
fastclustercpp_H +#define fastclustercpp_H + +// +// Assigns cluster labels (0, ..., nclust-1) to the n points such +// that the cluster result is split into nclust clusters. +// +// Input arguments: +// n = number of observables +// merge = clustering result in R format +// nclust = number of clusters +// Output arguments: +// labels = allocated integer array of size n for result +// +void cutree_k(int n, const int* merge, int nclust, int* labels); + +// +// Assigns cluster labels (0, ..., nclust-1) to the n points such +// that the hierarchical clsutering is stopped at cluster distance cdist +// +// Input arguments: +// n = number of observables +// merge = clustering result in R format +// height = cluster distance at each merge step +// cdist = cutoff cluster distance +// Output arguments: +// labels = allocated integer array of size n for result +// +void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels); + +// +// Hierarchical clustering with one of Daniel Muellner's fast algorithms +// +// Input arguments: +// n = number of observables +// distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing +// the upper triangle (without diagonal elements) of the distance +// matrix, e.g. for n=4: +// d00 d01 d02 d03 +// d10 d11 d12 d13 -> d02 d02 d03 d12 d13 d23 +// d20 d21 d22 d23 +// d30 d31 d32 d33 +// method = cluster metric (see enum method_code) +// Output arguments: +// merge = allocated (n-1)x2 matrix (2*(n-1) array) for storing result. 
+// Result follows R hclust convention: +// - observabe indices start with one +// - merge[i][] contains the merged nodes in step i +// - merge[i][j] is negative when the node is an atom +// height = allocated (n-1) array with distances at each merge step +// Return code: +// 0 = ok +// 1 = invalid method +// +int hclust_fast(int n, double* distmat, int method, int* merge, double* height); +enum hclust_fast_methods { + HCLUST_METHOD_SINGLE = 0, + HCLUST_METHOD_COMPLETE = 1, + HCLUST_METHOD_AVERAGE = 2, + HCLUST_METHOD_MEDIAN = 3 +}; + + +#endif diff --git a/src/spyral_utils/tripclust/src/hclust/fastcluster_R_dm.cpp b/src/spyral_utils/tripclust/src/hclust/fastcluster_R_dm.cpp new file mode 100644 index 0000000..cbe126c --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/fastcluster_R_dm.cpp @@ -0,0 +1,115 @@ +// +// Excerpt from fastcluster_R.cpp +// +// Copyright: Daniel Müllner, 2011 +// + +struct pos_node { + t_index pos; + int node; +}; + +void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) { + /* Parameters: + N : number of data points + merge : (N-1)×2 array which specifies the node indices which are + merged in each step of the clustering procedure. + Negative entries -1...-N point to singleton nodes, while + positive entries 1...(N-1) point to nodes which are themselves + parents of other nodes. + node_size : array of node sizes - makes it easier + order : output array of size N + + Runtime: Θ(N) + */ + auto_array_ptr queue(N/2); + + int parent; + int child; + t_index pos = 0; + + queue[0].pos = 0; + queue[0].node = N-2; + t_index idx = 1; + + do { + --idx; + pos = queue[idx].pos; + parent = queue[idx].node; + + // First child + child = merge[parent]; + if (child<0) { // singleton node, write this into the 'order' array. + order[pos] = -child; + ++pos; + } + else { /* compound node: put it on top of the queue and decompose it + in a later iteration. 
*/ + queue[idx].pos = pos; + queue[idx].node = child-1; // convert index-1 based to index-0 based + ++idx; + pos += node_size[child-1]; + } + // Second child + child = merge[parent+N-1]; + if (child<0) { + order[pos] = -child; + } + else { + queue[idx].pos = pos; + queue[idx].node = child-1; + ++idx; + } + } while (idx>0); +} + +#define size_(r_) ( ((r_ +void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) { + // The array "nodes" is a union-find data structure for the cluster + // identites (only needed for unsorted cluster_result input). + union_find nodes(sorted ? 0 : N); + if (!sorted) { + std::stable_sort(Z2[0], Z2[N-1]); + } + + t_index node1, node2; + auto_array_ptr node_size(N-1); + + for (t_index i=0; inode1; + node2 = Z2[i]->node2; + } + else { + node1 = nodes.Find(Z2[i]->node1); + node2 = nodes.Find(Z2[i]->node2); + // Merge the nodes in the union-find data structure by making them + // children of a new node. + nodes.Union(node1, node2); + } + // Sort the nodes in the output array. + if (node1>node2) { + t_index tmp = node1; + node1 = node2; + node2 = tmp; + } + /* Conversion between labeling conventions. 
+ Input: singleton nodes 0,...,N-1 + compound nodes N,...,2N-2 + Output: singleton nodes -1,...,-N + compound nodes 1,...,N + */ + merge[i] = (node1(node1)-1 + : static_cast(node1)-N+1; + merge[i+N-1] = (node2(node2)-1 + : static_cast(node2)-N+1; + height[i] = Z2[i]->dist; + node_size[i] = size_(node1) + size_(node2); + } + + order_nodes(N, merge, node_size, order); +} diff --git a/src/spyral_utils/tripclust/src/hclust/fastcluster_dm.cpp b/src/spyral_utils/tripclust/src/hclust/fastcluster_dm.cpp new file mode 100644 index 0000000..9b7f7eb --- /dev/null +++ b/src/spyral_utils/tripclust/src/hclust/fastcluster_dm.cpp @@ -0,0 +1,1794 @@ +/* + fastcluster: Fast hierarchical clustering routines for R and Python + + Copyright © 2011 Daniel Müllner + + + This library implements various fast algorithms for hierarchical, + agglomerative clustering methods: + + (1) Algorithms for the "stored matrix approach": the input is the array of + pairwise dissimilarities. + + MST_linkage_core: single linkage clustering with the "minimum spanning + tree algorithm (Rohlfs) + + NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, + complete, average, weighted and Ward linkage (Murtagh) + + generic_linkage: generic algorithm, suitable for all distance update + formulas (Müllner) + + (2) Algorithms for the "stored data approach": the input are points in a + vector space. + + MST_linkage_core_vector: single linkage clustering for vector data + + generic_linkage_vector: generic algorithm for vector data, suitable for + the Ward, centroid and median methods. + + generic_linkage_vector_alternative: alternative scheme for updating the + nearest neighbors. This method seems faster than "generic_linkage_vector" + for the centroid and median methods but slower for the Ward method. + + All these implementation treat infinity values correctly. They throw an + exception if a NaN distance value occurs. +*/ + +// Older versions of Microsoft Visual Studio do not have the fenv header. 
+#ifdef _MSC_VER +#if (_MSC_VER == 1500 || _MSC_VER == 1600) +#define NO_INCLUDE_FENV +#endif +#endif +// NaN detection via fenv might not work on systems with software +// floating-point emulation (bug report for Debian armel). +#ifdef __SOFTFP__ +#define NO_INCLUDE_FENV +#endif +#ifdef NO_INCLUDE_FENV +//#pragma message("Do not use fenv header.") +#else +//#pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.") +//#pragma STDC FENV_ACCESS on +#include +#endif + +#include // for std::pow, std::sqrt +#include // for std::ptrdiff_t +#include // for std::numeric_limits<...>::infinity() +#include // for std::fill_n +#include // for std::runtime_error +#include // for std::string + +#include // also for DBL_MAX, DBL_MIN +#ifndef DBL_MANT_DIG +#error The constant DBL_MANT_DIG could not be defined. +#endif +#define T_FLOAT_MANT_DIG DBL_MANT_DIG + +#ifndef LONG_MAX +#include +#endif +#ifndef LONG_MAX +#error The constant LONG_MAX could not be defined. +#endif +#ifndef INT_MAX +#error The constant INT_MAX could not be defined. +#endif + +#ifndef INT32_MAX +#ifdef _MSC_VER +#if _MSC_VER >= 1600 +#define __STDC_LIMIT_MACROS +#include +#else +typedef __int32 int_fast32_t; +typedef __int64 int64_t; +#endif +#else +#define __STDC_LIMIT_MACROS +#include +#endif +#endif + +#define FILL_N std::fill_n +#ifdef _MSC_VER +#if _MSC_VER < 1600 +#undef FILL_N +#define FILL_N stdext::unchecked_fill_n +#endif +#endif + +// Suppress warnings about (potentially) uninitialized variables. 
+#ifdef _MSC_VER + #pragma warning (disable:4700) +#endif + +#ifndef HAVE_DIAGNOSTIC +#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) +#define HAVE_DIAGNOSTIC 1 +#endif +#endif + +#ifndef HAVE_VISIBILITY +#if __GNUC__ >= 4 +#define HAVE_VISIBILITY 1 +#endif +#endif + +/* Since the public interface is given by the Python respectively R interface, + * we do not want other symbols than the interface initalization routines to be + * visible in the shared object file. The "visibility" switch is a GCC concept. + * Hiding symbols keeps the relocation table small and decreases startup time. + * See http://gcc.gnu.org/wiki/Visibility + */ +#if HAVE_VISIBILITY +#pragma GCC visibility push(hidden) +#endif + +typedef int_fast32_t t_index; +#ifndef INT32_MAX +#define MAX_INDEX 0x7fffffffL +#else +#define MAX_INDEX INT32_MAX +#endif +#if (LONG_MAX < MAX_INDEX) +#error The integer format "t_index" must not have a greater range than "long int". +#endif +#if (INT_MAX > MAX_INDEX) +#error The integer format "int" must not have a greater range than "t_index". +#endif +typedef double t_float; + +/* Method codes. + + These codes must agree with the METHODS array in fastcluster.R and the + dictionary mthidx in fastcluster.py. 
+*/ +enum method_codes { + // non-Euclidean methods + METHOD_METR_SINGLE = 0, + METHOD_METR_COMPLETE = 1, + METHOD_METR_AVERAGE = 2, + METHOD_METR_WEIGHTED = 3, + METHOD_METR_WARD = 4, + METHOD_METR_WARD_D = METHOD_METR_WARD, + METHOD_METR_CENTROID = 5, + METHOD_METR_MEDIAN = 6, + METHOD_METR_WARD_D2 = 7, + + MIN_METHOD_CODE = 0, + MAX_METHOD_CODE = 7 +}; + +enum method_codes_vector { + // Euclidean methods + METHOD_VECTOR_SINGLE = 0, + METHOD_VECTOR_WARD = 1, + METHOD_VECTOR_CENTROID = 2, + METHOD_VECTOR_MEDIAN = 3, + + MIN_METHOD_VECTOR_CODE = 0, + MAX_METHOD_VECTOR_CODE = 3 +}; + +// self-destructing array pointer +template +class auto_array_ptr{ +private: + type * ptr; + auto_array_ptr(auto_array_ptr const &); // non construction-copyable + auto_array_ptr& operator=(auto_array_ptr const &); // non copyable +public: + auto_array_ptr() + : ptr(NULL) + { } + template + auto_array_ptr(index const size) + : ptr(new type[size]) + { } + template + auto_array_ptr(index const size, value const val) + : ptr(new type[size]) + { + FILL_N(ptr, size, val); + } + ~auto_array_ptr() { + delete [] ptr; } + void free() { + delete [] ptr; + ptr = NULL; + } + template + void init(index const size) { + ptr = new type [size]; + } + template + void init(index const size, value const val) { + init(size); + FILL_N(ptr, size, val); + } + inline operator type *() const { return ptr; } +}; + +struct node { + t_index node1, node2; + t_float dist; +}; + +inline bool operator< (const node a, const node b) { + return (a.dist < b.dist); +} + +class cluster_result { +private: + auto_array_ptr Z; + t_index pos; + +public: + cluster_result(const t_index size) + : Z(size) + , pos(0) + {} + + void append(const t_index node1, const t_index node2, const t_float dist) { + Z[pos].node1 = node1; + Z[pos].node2 = node2; + Z[pos].dist = dist; + ++pos; + } + + node * operator[] (const t_index idx) const { return Z + idx; } + + /* Define several methods to postprocess the distances. 
All these functions + are monotone, so they do not change the sorted order of distances. */ + + void sqrt() const { + for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { + ZZ->dist = std::sqrt(ZZ->dist); + } + } + + void sqrt(const t_float) const { // ignore the argument + sqrt(); + } + + void sqrtdouble(const t_float) const { // ignore the argument + for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { + ZZ->dist = std::sqrt(2*ZZ->dist); + } + } + + #ifdef R_pow + #define my_pow R_pow + #else + #define my_pow std::pow + #endif + + void power(const t_float p) const { + t_float const q = 1/p; + for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { + ZZ->dist = my_pow(ZZ->dist,q); + } + } + + void plusone(const t_float) const { // ignore the argument + for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { + ZZ->dist += 1; + } + } + + void divide(const t_float denom) const { + for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { + ZZ->dist /= denom; + } + } +}; + +class doubly_linked_list { + /* + Class for a doubly linked list. Initially, the list is the integer range + [0, size]. We provide a forward iterator and a method to delete an index + from the list. + + Typical use: for (i=L.start; L succ; + +private: + auto_array_ptr pred; + // Not necessarily private, we just do not need it in this instance. + +public: + doubly_linked_list(const t_index size) + // Initialize to the given size. + : start(0) + , succ(size+1) + , pred(size+1) + { + for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) +// Z is an ((N-1)x4)-array +#define Z_(_r, _c) (Z[(_r)*4 + (_c)]) + +/* + Lookup function for a union-find data structure. + + The function finds the root of idx by going iteratively through all + parent elements until a root is found. An element i is a root if + nodes[i] is zero. To make subsequent searches faster, the entry for + idx and all its parents is updated with the root element. + */ +class union_find { +private: + auto_array_ptr parent; + t_index nextparent; + +public: + union_find(const t_index size) + : parent(size>0 ? 
2*size-1 : 0, 0) + , nextparent(size) + { } + + t_index Find (t_index idx) const { + if (parent[idx] != 0 ) { // a → b + t_index p = idx; + idx = parent[idx]; + if (parent[idx] != 0 ) { // a → b → c + do { + idx = parent[idx]; + } while (parent[idx] != 0); + do { + t_index tmp = parent[p]; + parent[p] = idx; + p = tmp; + } while (parent[p] != idx); + } + } + return idx; + } + + void Union (const t_index node1, const t_index node2) { + parent[node1] = parent[node2] = nextparent++; + } +}; + +class nan_error{}; +#ifdef FE_INVALID +class fenv_error{}; +#endif + +static void MST_linkage_core(const t_index N, const t_float * const D, + cluster_result & Z2) { +/* + N: integer, number of data points + D: condensed distance matrix N*(N-1)/2 + Z2: output data structure + + The basis of this algorithm is an algorithm by Rohlf: + + F. James Rohlf, Hierarchical clustering using the minimum spanning tree, + The Computer Journal, vol. 16, 1973, p. 93–95. +*/ + t_index i; + t_index idx2; + doubly_linked_list active_nodes(N); + auto_array_ptr d(N); + + t_index prev_node; + t_float min; + + // first iteration + idx2 = 1; + min = std::numeric_limits::infinity(); + for (i=1; i tmp) + d[i] = tmp; + else if (fc_isnan(tmp)) + throw (nan_error()); +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + if (d[i] < min) { + min = d[i]; + idx2 = i; + } + } + Z2.append(prev_node, idx2, min); + } +} + +/* Functions for the update of the dissimilarity array */ + +inline static void f_single( t_float * const b, const t_float a ) { + if (*b > a) *b = a; +} +inline static void f_complete( t_float * const b, const t_float a ) { + if (*b < a) *b = a; +} +inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { + *b = s*a + t*(*b); + #ifndef FE_INVALID +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + if (fc_isnan(*b)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop 
+#endif + #endif +} +inline static void f_weighted( t_float * const b, const t_float a) { + *b = (a+*b)*.5; + #ifndef FE_INVALID +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + if (fc_isnan(*b)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + #endif +} +inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { + *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); + //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); + #ifndef FE_INVALID +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + if (fc_isnan(*b)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + #endif +} +inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { + *b = s*a - stc + t*(*b); + #ifndef FE_INVALID + if (fc_isnan(*b)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + #endif +} +inline static void f_median( t_float * const b, const t_float a, const t_float c_4) { + *b = (a+(*b))*.5 - c_4; + #ifndef FE_INVALID +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + if (fc_isnan(*b)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + #endif +} + +template +static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { +/* + N: integer + D: condensed distance matrix N*(N-1)/2 + Z2: output data structure + + This is the NN-chain algorithm, described on page 86 in the following book: + + Fionn Murtagh, Multidimensional Clustering Algorithms, + Vienna, Würzburg: Physica-Verlag, 1985. 
+*/ + t_index i; + + auto_array_ptr NN_chain(N); + t_index NN_chain_tip = 0; + + t_index idx1, idx2; + + t_float size1, size2; + doubly_linked_list active_nodes(N); + + t_float min; + + for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); + ++DD) { +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + if (fc_isnan(*DD)) { + throw(nan_error()); + } +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + } + + #ifdef FE_INVALID + if (feclearexcept(FE_INVALID)) throw fenv_error(); + #endif + + for (t_index j=0; jidx2) { + t_index tmp = idx1; + idx1 = idx2; + idx2 = tmp; + } + + if (method==METHOD_METR_AVERAGE || + method==METHOD_METR_WARD) { + size1 = static_cast(members[idx1]); + size2 = static_cast(members[idx2]); + members[idx2] += members[idx1]; + } + + // Remove the smaller index from the valid indices (active_nodes). + active_nodes.remove(idx1); + + switch (method) { + case METHOD_METR_SINGLE: + /* + Single linkage. + + Characteristic: new distances are never longer than the old distances. + */ + // Update the distance matrix in the range [start, idx1). + for (i=active_nodes.start; i(members[i]); + for (i=active_nodes.start; i(members[i]) ); + // Update the distance matrix in the range (idx1, idx2). + for (; i(members[i]) ); + // Update the distance matrix in the range (idx2, N). + for (i=active_nodes.succ[idx2]; i(members[i]) ); + break; + + default: + throw std::runtime_error(std::string("Invalid method.")); + } + } + #ifdef FE_INVALID + if (fetestexcept(FE_INVALID)) throw fenv_error(); + #endif +} + +class binary_min_heap { + /* + Class for a binary min-heap. The data resides in an array A. The elements of + A are not changed but two lists I and R of indices are generated which point + to elements of A and backwards. + + The heap tree structure is + + H[2*i+1] H[2*i+2] + \ / + \ / + ≤ ≤ + \ / + \ / + H[i] + + where the children must be less or equal than their parent. 
Thus, H[0] + contains the minimum. The lists I and R are made such that H[i] = A[I[i]] + and R[I[i]] = i. + + This implementation is not designed to handle NaN values. + */ +private: + t_float * const A; + t_index size; + auto_array_ptr I; + auto_array_ptr R; + + // no default constructor + binary_min_heap(); + // noncopyable + binary_min_heap(binary_min_heap const &); + binary_min_heap & operator=(binary_min_heap const &); + +public: + binary_min_heap(t_float * const A_, const t_index size_) + : A(A_), size(size_), I(size), R(size) + { // Allocate memory and initialize the lists I and R to the identity. This + // does not make it a heap. Call heapify afterwards! + for (t_index i=0; i>1); idx>0; ) { + --idx; + update_geq_(idx); + } + } + + inline t_index argmin() const { + // Return the minimal element. + return I[0]; + } + + void heap_pop() { + // Remove the minimal element from the heap. + --size; + I[0] = I[size]; + R[I[0]] = 0; + update_geq_(0); + } + + void remove(t_index idx) { + // Remove an element from the heap. + --size; + R[I[size]] = R[idx]; + I[R[idx]] = I[size]; + if ( H(size)<=A[idx] ) { + update_leq_(R[idx]); + } + else { + update_geq_(R[idx]); + } + } + + void replace ( const t_index idxold, const t_index idxnew, + const t_float val) { + R[idxnew] = R[idxold]; + I[R[idxnew]] = idxnew; + if (val<=A[idxold]) + update_leq(idxnew, val); + else + update_geq(idxnew, val); + } + + void update ( const t_index idx, const t_float val ) const { + // Update the element A[i] with val and re-arrange the indices to preserve + // the heap condition. + if (val<=A[idx]) + update_leq(idx, val); + else + update_geq(idx, val); + } + + void update_leq ( const t_index idx, const t_float val ) const { + // Use this when the new value is not more than the old value. + A[idx] = val; + update_leq_(R[idx]); + } + + void update_geq ( const t_index idx, const t_float val ) const { + // Use this when the new value is not less than the old value. 
+ A[idx] = val; + update_geq_(R[idx]); + } + +private: + void update_leq_ (t_index i) const { + t_index j; + for ( ; (i>0) && ( H(i)>1) ); i=j) + heap_swap(i,j); + } + + void update_geq_ (t_index i) const { + t_index j; + for ( ; (j=2*i+1)=H(i) ) { + ++j; + if ( j>=size || H(j)>=H(i) ) break; + } + else if ( j+1 +static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { + /* + N: integer, number of data points + D: condensed distance matrix N*(N-1)/2 + Z2: output data structure + */ + + const t_index N_1 = N-1; + t_index i, j; // loop variables + t_index idx1, idx2; // row and column indices + + auto_array_ptr n_nghbr(N_1); // array of nearest neighbors + auto_array_ptr mindist(N_1); // distances to the nearest neighbors + auto_array_ptr row_repr(N); // row_repr[i]: node number that the + // i-th row represents + doubly_linked_list active_nodes(N); + binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for + // the distance to the nearest neighbor of each point + t_index node1, node2; // node numbers in the output + t_float size1, size2; // and their cardinalities + + t_float min; // minimum and row index for nearest-neighbor search + t_index idx; + + for (i=0; ii} D(i,j) for i in range(N-1) + t_float const * DD = D; + for (i=0; i::infinity(); + for (idx=j=i+1; ji} D(i,j) + + Normally, we have equality. However, this minimum may become invalid due + to the updates in the distance matrix. The rules are: + + 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct + minimum and n_nghbr[i] is a nearest neighbor. + + 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the + correct minimum. The minimum needs to be recomputed. + + 3) mindist[i] is never bigger than the true minimum. 
Hence, we never + miss the true minimum if we take the smallest mindist entry, + re-compute the value if necessary (thus maybe increasing it) and + looking for the now smallest mindist entry until a valid minimal + entry is found. This step is done in the lines below. + + The update process for D below takes care that these rules are + fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are + re-calculated when necessary but re-calculation is avoided whenever + possible. + + The re-calculation of the minima makes the worst-case runtime of this + algorithm cubic in N. We avoid this whenever possible, and in most cases + the runtime appears to be quadratic. + */ + idx1 = nn_distances.argmin(); + if (method != METHOD_METR_SINGLE) { + while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { + // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. + n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 + min = D_(idx1,j); + for (j=active_nodes.succ[j]; j(members[idx1]); + size2 = static_cast(members[idx2]); + members[idx2] += members[idx1]; + } + Z2.append(node1, node2, mindist[idx1]); + + // Remove idx1 from the list of active indices (active_nodes). + active_nodes.remove(idx1); + // Index idx2 now represents the new (merged) node with label N+i. + row_repr[idx2] = N+i; + + // Update the distance matrix + switch (method) { + case METHOD_METR_SINGLE: + /* + Single linkage. + + Characteristic: new distances are never longer than the old distances. + */ + // Update the distance matrix in the range [start, idx1). + for (j=active_nodes.start; j(members[j]) ); + if (n_nghbr[j] == idx1) + n_nghbr[j] = idx2; + } + // Update the distance matrix in the range (idx1, idx2). + for (; j(members[j]) ); + if (D_(j, idx2) < mindist[j]) { + nn_distances.update_leq(j, D_(j, idx2)); + n_nghbr[j] = idx2; + } + } + // Update the distance matrix in the range (idx2, N). 
+ if (idx2(members[j]) ); + min = D_(idx2,j); + for (j=active_nodes.succ[j]; j(members[j]) ); + if (D_(idx2,j) < min) { + min = D_(idx2,j); + n_nghbr[idx2] = j; + } + } + nn_distances.update(idx2, min); + } + break; + + case METHOD_METR_CENTROID: { + /* + Centroid linkage. + + Shorter and longer distances can occur, not bigger than max(d1,d2) + but maybe smaller than min(d1,d2). + */ + // Update the distance matrix in the range [start, idx1). + t_float s = size1/(size1+size2); + t_float t = size2/(size1+size2); + t_float stc = s*t*mindist[idx1]; + for (j=active_nodes.start; j +static void MST_linkage_core_vector(const t_index N, + t_dissimilarity & dist, + cluster_result & Z2) { +/* + N: integer, number of data points + dist: function pointer to the metric + Z2: output data structure + + The basis of this algorithm is an algorithm by Rohlf: + + F. James Rohlf, Hierarchical clustering using the minimum spanning tree, + The Computer Journal, vol. 16, 1973, p. 93–95. +*/ + t_index i; + t_index idx2; + doubly_linked_list active_nodes(N); + auto_array_ptr d(N); + + t_index prev_node; + t_float min; + + // first iteration + idx2 = 1; + min = std::numeric_limits::infinity(); + for (i=1; i tmp) + d[i] = tmp; + else if (fc_isnan(tmp)) + throw (nan_error()); +#if HAVE_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + if (d[i] < min) { + min = d[i]; + idx2 = i; + } + } + Z2.append(prev_node, idx2, min); + } +} + +template +static void generic_linkage_vector(const t_index N, + t_dissimilarity & dist, + cluster_result & Z2) { + /* + N: integer, number of data points + dist: function pointer to the metric + Z2: output data structure + + This algorithm is valid for the distance update methods + "Ward", "centroid" and "median" only! 
+ */ + const t_index N_1 = N-1; + t_index i, j; // loop variables + t_index idx1, idx2; // row and column indices + + auto_array_ptr n_nghbr(N_1); // array of nearest neighbors + auto_array_ptr mindist(N_1); // distances to the nearest neighbors + auto_array_ptr row_repr(N); // row_repr[i]: node number that the + // i-th row represents + doubly_linked_list active_nodes(N); + binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for + // the distance to the nearest neighbor of each point + t_index node1, node2; // node numbers in the output + t_float min; // minimum and row index for nearest-neighbor search + + for (i=0; ii} D(i,j) for i in range(N-1) + for (i=0; i::infinity(); + t_index idx; + for (idx=j=i+1; j(i,j); + } + if (tmp(idx1,j); + for (j=active_nodes.succ[j]; j(idx1,j); + if (tmp(j, idx2); + if (tmp < mindist[j]) { + nn_distances.update_leq(j, tmp); + n_nghbr[j] = idx2; + } + else if (n_nghbr[j] == idx2) + n_nghbr[j] = idx1; // invalidate + } + // Find the nearest neighbor for idx2. + if (idx2(idx2,j); + for (j=active_nodes.succ[j]; j(idx2, j); + if (tmp < min) { + min = tmp; + n_nghbr[idx2] = j; + } + } + nn_distances.update(idx2, min); + } + } + } +} + +template +static void generic_linkage_vector_alternative(const t_index N, + t_dissimilarity & dist, + cluster_result & Z2) { + /* + N: integer, number of data points + dist: function pointer to the metric + Z2: output data structure + + This algorithm is valid for the distance update methods + "Ward", "centroid" and "median" only! 
+ */ + const t_index N_1 = N-1; + t_index i, j=0; // loop variables + t_index idx1, idx2; // row and column indices + + auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors + auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors + + doubly_linked_list active_nodes(N+N_1); + binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap + // structure for the distance to the nearest neighbor of each point + + t_float min; // minimum for nearest-neighbor searches + + // Initialize the minimal distances: + // Find the nearest neighbor of each point. + // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) + for (i=1; i::infinity(); + t_index idx; + for (idx=j=0; j(i,j); + } + if (tmp +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/spyral_utils/tripclust/src/kdtree/README b/src/spyral_utils/tripclust/src/kdtree/README new file mode 100644 index 0000000..3c47bab --- /dev/null +++ b/src/spyral_utils/tripclust/src/kdtree/README @@ -0,0 +1,25 @@ +C++ Kd-Tree Library +=================== + +This is a standalone C++ version of the kd-tree implementation in the +Gamera framework that has been extended by a range search and has +been relicensed by the original author under a BSD style license. + +Usage of the library +-------------------- + +Design and usage of the library are described in detail in + +C. Dalitz: Kd-Trees for Document Layout Analysis. +In C. Dalitz (Ed.): "Document Image Analysis with the Gamera Framework." +Schriftenreihe des Fachbereichs Elektrotechnik und Informatik, +Hochschule Niederrhein, vol. 8, pp. 39-52, Shaker Verlag (2009) + + +Authors & Copyright +------------------- + +Christoph Dalitz, 2018-2022, +Jens Wilberg, 2018, + +For licensing information, see the file LICENSE for details. diff --git a/src/spyral_utils/tripclust/src/kdtree/kdtree.cpp b/src/spyral_utils/tripclust/src/kdtree/kdtree.cpp new file mode 100644 index 0000000..0b62958 --- /dev/null +++ b/src/spyral_utils/tripclust/src/kdtree/kdtree.cpp @@ -0,0 +1,447 @@ +// +// Kd-Tree implementation. 
+// +// Copyright: Christoph Dalitz, 2018 +// Jens Wilberg, 2018 +// Version: 1.2 +// License: BSD style license +// (see the file LICENSE for details) +// + +#include "kdtree.hpp" +#include <math.h> +#include <algorithm> +#include <limits> +#include <stdexcept> + +namespace Kdtree { + +//-------------------------------------------------------------- +// function object for comparing only dimension d of two vectors +//-------------------------------------------------------------- +class compare_dimension { + public: + compare_dimension(size_t dim) { d = dim; } + bool operator()(const KdNode& p, const KdNode& q) { + return (p.point[d] < q.point[d]); + } + size_t d; +}; + +//-------------------------------------------------------------- +// internal node structure used by kdtree +//-------------------------------------------------------------- +class kdtree_node { + public: + kdtree_node() { + dataindex = cutdim = 0; + loson = hison = (kdtree_node*)NULL; + } + ~kdtree_node() { + if (loson) delete loson; + if (hison) delete hison; + } + // index of node data in kdtree array "allnodes" + size_t dataindex; + // cutting dimension + size_t cutdim; + // value of point + // double cutval; // == point[cutdim] + CoordPoint point; + // roots of the two subtrees + kdtree_node *loson, *hison; + // bounding rectangle of this node's subtree + CoordPoint lobound, upbound; +}; + +//-------------------------------------------------------------- +// different distance metrics +//-------------------------------------------------------------- +class DistanceMeasure { + public: + DistanceMeasure() {} + virtual ~DistanceMeasure() {} + virtual double distance(const CoordPoint& p, const CoordPoint& q) = 0; + virtual double coordinate_distance(double x, double y, size_t dim) = 0; +}; +// Maximum distance (Linfinite norm) +class DistanceL0 : virtual public DistanceMeasure { + DoubleVector* w; + + public: + DistanceL0(const DoubleVector* weights = NULL) { + if (weights) + w = new DoubleVector(*weights); + else + w =
(DoubleVector*)NULL; + } + ~DistanceL0() { + if (w) delete w; + } + double distance(const CoordPoint& p, const CoordPoint& q) { + size_t i; + double dist, test; + if (w) { + dist = (*w)[0] * fabs(p[0] - q[0]); + for (i = 1; i < p.size(); i++) { + test = (*w)[i] * fabs(p[i] - q[i]); + if (test > dist) dist = test; + } + } else { + dist = fabs(p[0] - q[0]); + for (i = 1; i < p.size(); i++) { + test = fabs(p[i] - q[i]); + if (test > dist) dist = test; + } + } + return dist; + } + double coordinate_distance(double x, double y, size_t dim) { + if (w) + return (*w)[dim] * fabs(x - y); + else + return fabs(x - y); + } +}; +// Manhatten distance (L1 norm) +class DistanceL1 : virtual public DistanceMeasure { + DoubleVector* w; + + public: + DistanceL1(const DoubleVector* weights = NULL) { + if (weights) + w = new DoubleVector(*weights); + else + w = (DoubleVector*)NULL; + } + ~DistanceL1() { + if (w) delete w; + } + double distance(const CoordPoint& p, const CoordPoint& q) { + size_t i; + double dist = 0.0; + if (w) { + for (i = 0; i < p.size(); i++) dist += (*w)[i] * fabs(p[i] - q[i]); + } else { + for (i = 0; i < p.size(); i++) dist += fabs(p[i] - q[i]); + } + return dist; + } + double coordinate_distance(double x, double y, size_t dim) { + if (w) + return (*w)[dim] * fabs(x - y); + else + return fabs(x - y); + } +}; +// Euklidean distance (L2 norm) +class DistanceL2 : virtual public DistanceMeasure { + DoubleVector* w; + + public: + DistanceL2(const DoubleVector* weights = NULL) { + if (weights) + w = new DoubleVector(*weights); + else + w = (DoubleVector*)NULL; + } + ~DistanceL2() { + if (w) delete w; + } + double distance(const CoordPoint& p, const CoordPoint& q) { + size_t i; + double dist = 0.0; + if (w) { + for (i = 0; i < p.size(); i++) + dist += (*w)[i] * (p[i] - q[i]) * (p[i] - q[i]); + } else { + for (i = 0; i < p.size(); i++) dist += (p[i] - q[i]) * (p[i] - q[i]); + } + return dist; + } + double coordinate_distance(double x, double y, size_t dim) { + if (w) + 
return (*w)[dim] * (x - y) * (x - y); + else + return (x - y) * (x - y); + } +}; + +//-------------------------------------------------------------- +// destructor and constructor of kdtree +//-------------------------------------------------------------- +KdTree::~KdTree() { + if (root) delete root; + delete distance; +} +// distance_type can be 0 (Maximum), 1 (Manhatten), or 2 (Euklidean) +KdTree::KdTree(const KdNodeVector* nodes, int distance_type /*=2*/) { + size_t i, j; + double val; + // copy over input data + if (!nodes || nodes->empty()) + throw std::invalid_argument( + "kdtree::KdTree(): argument nodes must not be empty"); + dimension = nodes->begin()->point.size(); + allnodes = *nodes; + // initialize distance values + distance = NULL; + this->distance_type = -1; + set_distance(distance_type); + // compute global bounding box + lobound = nodes->begin()->point; + upbound = nodes->begin()->point; + for (i = 1; i < nodes->size(); i++) { + for (j = 0; j < dimension; j++) { + val = allnodes[i].point[j]; + if (lobound[j] > val) lobound[j] = val; + if (upbound[j] < val) upbound[j] = val; + } + } + // build tree recursively + root = build_tree(0, 0, allnodes.size()); +} + +// distance_type can be 0 (Maximum), 1 (Manhatten), or 2 (Euklidean) +void KdTree::set_distance(int distance_type, + const DoubleVector* weights /*=NULL*/) { + if (distance) delete distance; + this->distance_type = distance_type; + if (distance_type == 0) { + distance = (DistanceMeasure*)new DistanceL0(weights); + } else if (distance_type == 1) { + distance = (DistanceMeasure*)new DistanceL1(weights); + } else { + distance = (DistanceMeasure*)new DistanceL2(weights); + } +} + +//-------------------------------------------------------------- +// recursive build of tree +// "a" and "b"-1 are the lower and upper indices +// from "allnodes" from which the subtree is to be built +//-------------------------------------------------------------- +kdtree_node* KdTree::build_tree(size_t depth, size_t a, 
size_t b) { + size_t m; + double temp, cutval; + kdtree_node* node = new kdtree_node(); + node->lobound = lobound; + node->upbound = upbound; + node->cutdim = depth % dimension; + if (b - a <= 1) { + node->dataindex = a; + node->point = allnodes[a].point; + } else { + m = (a + b) / 2; + std::nth_element(allnodes.begin() + a, allnodes.begin() + m, + allnodes.begin() + b, compare_dimension(node->cutdim)); + node->point = allnodes[m].point; + cutval = allnodes[m].point[node->cutdim]; + node->dataindex = m; + if (m - a > 0) { + temp = upbound[node->cutdim]; + upbound[node->cutdim] = cutval; + node->loson = build_tree(depth + 1, a, m); + upbound[node->cutdim] = temp; + } + if (b - m > 1) { + temp = lobound[node->cutdim]; + lobound[node->cutdim] = cutval; + node->hison = build_tree(depth + 1, m + 1, b); + lobound[node->cutdim] = temp; + } + } + return node; +} + +//-------------------------------------------------------------- +// k nearest neighbor search +// returns the *k* nearest neighbors of *point* in O(log(n)) +// time. The result is returned in *result* and is sorted by +// distance from *point*. +// The optional search predicate is a callable class (aka "functor") +// derived from KdNodePredicate. When NULL (default), no search +// predicate is applied.
+//-------------------------------------------------------------- +void KdTree::k_nearest_neighbors(const CoordPoint& point, size_t k, + KdNodeVector* result, + std::vector* distances, + KdNodePredicate* pred /*=NULL*/) { + size_t i; + double d, temp_dist; + KdNode temp; + searchpredicate = pred; + + result->clear(); + if (k < 1) return; + if (point.size() != dimension) + throw std::invalid_argument( + "kdtree::k_nearest_neighbors(): point must be of same dimension as " + "kdtree"); + + // collect result of k values in neighborheap + //std::priority_queue, compare_nn4heap>* + //neighborheap = new std::priority_queue, compare_nn4heap>(); + SearchQueue* neighborheap = new SearchQueue(); + if (k > allnodes.size()) { + // when more neighbors asked than nodes in tree, return everything + k = allnodes.size(); + for (i = 0; i < k; i++) { + if (!(searchpredicate && !(*searchpredicate)(allnodes[i]))) + neighborheap->push( + nn4heap(i, distance->distance(allnodes[i].point, point))); + } + } else { + neighbor_search(point, root, k, neighborheap); + } + + // copy over result sorted by distance + // (we must revert the vector for ascending order) + while (!neighborheap->empty()) { + i = neighborheap->top().dataindex; + d = neighborheap->top().distance; + neighborheap->pop(); + result->push_back(allnodes[i]); + distances->push_back(d); + } + // beware that less than k results might have been returned + k = result->size(); + for (i = 0; i < k / 2; i++) { + temp = (*result)[i]; + (*result)[i] = (*result)[k - 1 - i]; + (*result)[k - 1 - i] = temp; + temp_dist = (*distances)[i]; + (*distances)[i] = (*distances)[k - 1 - i]; + (*distances)[k - 1 - i] = temp_dist; + } + delete neighborheap; +} + +//-------------------------------------------------------------- +// range nearest neighbor search +// returns the nearest neighbors of *point* in the given range +// *r*. The result is returned in *result* and is sorted by +// distance from *point*. 
+//-------------------------------------------------------------- +void KdTree::range_nearest_neighbors(const CoordPoint& point, double r, + KdNodeVector* result) { + KdNode temp; + + result->clear(); + if (point.size() != dimension) + throw std::invalid_argument( + "kdtree::k_nearest_neighbors(): point must be of same dimension as " + "kdtree"); + if (this->distance_type == 2) { + // if euclidien distance is used the range must be squared because we + // get squared distances from this implementation + r *= r; + } + + // collect result in range_result + std::vector range_result; + range_search(point, root, r, &range_result); + + // copy over result + for (std::vector::iterator i = range_result.begin(); + i != range_result.end(); ++i) { + result->push_back(allnodes[*i]); + } + + // clear vector + range_result.clear(); +} + +//-------------------------------------------------------------- +// recursive function for nearest neighbor search in subtree +// under *node*. Stores result in *neighborheap*. 
+// returns "true" when no nearer neighbor elsewhere possible +//-------------------------------------------------------------- +bool KdTree::neighbor_search(const CoordPoint& point, kdtree_node* node, + size_t k, SearchQueue* neighborheap) { + double curdist, dist; + + curdist = distance->distance(point, node->point); + if (!(searchpredicate && !(*searchpredicate)(allnodes[node->dataindex]))) { + if (neighborheap->size() < k) { + neighborheap->push(nn4heap(node->dataindex, curdist)); + } else if (curdist < neighborheap->top().distance) { + neighborheap->pop(); + neighborheap->push(nn4heap(node->dataindex, curdist)); + } + } + // first search on side closer to point + if (point[node->cutdim] < node->point[node->cutdim]) { + if (node->loson) + if (neighbor_search(point, node->loson, k, neighborheap)) return true; + } else { + if (node->hison) + if (neighbor_search(point, node->hison, k, neighborheap)) return true; + } + // second search on farther side, if necessary + if (neighborheap->size() < k) { + dist = std::numeric_limits::max(); + } else { + dist = neighborheap->top().distance; + } + if (point[node->cutdim] < node->point[node->cutdim]) { + if (node->hison && bounds_overlap_ball(point, dist, node->hison)) + if (neighbor_search(point, node->hison, k, neighborheap)) return true; + } else { + if (node->loson && bounds_overlap_ball(point, dist, node->loson)) + if (neighbor_search(point, node->loson, k, neighborheap)) return true; + } + + if (neighborheap->size() == k) dist = neighborheap->top().distance; + return ball_within_bounds(point, dist, node); +} + +//-------------------------------------------------------------- +// recursive function for range search in subtree under *node*. +// Stores result in *range_result*. 
+//-------------------------------------------------------------- +void KdTree::range_search(const CoordPoint& point, kdtree_node* node, + double r, std::vector* range_result) { + double curdist = distance->distance(point, node->point); + if (curdist <= r) { + range_result->push_back(node->dataindex); + } + if (node->loson != NULL && this->bounds_overlap_ball(point, r, node->loson)) { + range_search(point, node->loson, r, range_result); + } + if (node->hison != NULL && this->bounds_overlap_ball(point, r, node->hison)) { + range_search(point, node->hison, r, range_result); + } +} + +// returns true when the bounds of *node* overlap with the +// ball with radius *dist* around *point* +bool KdTree::bounds_overlap_ball(const CoordPoint& point, double dist, + kdtree_node* node) { + double distsum = 0.0; + size_t i; + for (i = 0; i < dimension; i++) { + if (point[i] < node->lobound[i]) { // lower than low boundary + distsum += distance->coordinate_distance(point[i], node->lobound[i], i); + if (distsum > dist) return false; + } else if (point[i] > node->upbound[i]) { // higher than high boundary + distsum += distance->coordinate_distance(point[i], node->upbound[i], i); + if (distsum > dist) return false; + } + } + return true; +} + +// returns true when the bounds of *node* completely contain the +// ball with radius *dist* around *point* +bool KdTree::ball_within_bounds(const CoordPoint& point, double dist, + kdtree_node* node) { + size_t i; + for (i = 0; i < dimension; i++) + if (distance->coordinate_distance(point[i], node->lobound[i], i) <= dist || + distance->coordinate_distance(point[i], node->upbound[i], i) <= dist) + return false; + return true; +} + +} // namespace Kdtree \ No newline at end of file diff --git a/src/spyral_utils/tripclust/src/kdtree/kdtree.hpp b/src/spyral_utils/tripclust/src/kdtree/kdtree.hpp new file mode 100644 index 0000000..c37c113 --- /dev/null +++ b/src/spyral_utils/tripclust/src/kdtree/kdtree.hpp @@ -0,0 +1,110 @@ +#ifndef __kdtree_HPP 
+#define __kdtree_HPP + +// +// Kd-Tree implementation. +// +// Copyright: Christoph Dalitz, 2018-2022 +// Jens Wilberg, 2018 +// Version: 1.2 +// License: BSD style license +// (see the file LICENSE for details) +// + +#include +#include +#include + +namespace Kdtree { + +typedef std::vector CoordPoint; +typedef std::vector DoubleVector; + +// for passing points to the constructor of kdtree +struct KdNode { + CoordPoint point; + void* data; + int index; + KdNode(const CoordPoint& p, void* d = NULL, int i = -1) { + point = p; + data = d; + index = i; + } + KdNode() { data = NULL; } +}; +typedef std::vector KdNodeVector; + +// base function object for search predicate in knn search +// returns true when the given KdNode is an admissible neighbor +// To define an own search predicate, derive from this class +// and overwrite the call operator operator() +struct KdNodePredicate { + virtual ~KdNodePredicate() {} + virtual bool operator()(const KdNode&) const { return true; } +}; + +//-------------------------------------------------------- +// private helper classes used internally by KdTree +// +// the internal node structure used by kdtree +class kdtree_node; +// base class for different distance computations +class DistanceMeasure; +// helper class for priority queue in k nearest neighbor search +class nn4heap { + public: + size_t dataindex; // index of actual kdnode in *allnodes* + double distance; // distance of this neighbor from *point* + nn4heap(size_t i, double d) { + dataindex = i; + distance = d; + } +}; +class compare_nn4heap { + public: + bool operator()(const nn4heap& n, const nn4heap& m) { + return (n.distance < m.distance); + } +}; + typedef std::priority_queue, compare_nn4heap> SearchQueue; +//-------------------------------------------------------- + +// kdtree class +class KdTree { + private: + // recursive build of tree + kdtree_node* build_tree(size_t depth, size_t a, size_t b); + // helper variable for keeping track of subtree bounding box + 
CoordPoint lobound, upbound; + // helper variable to check the distance method + int distance_type; + bool neighbor_search(const CoordPoint& point, kdtree_node* node, size_t k, SearchQueue* neighborheap); + void range_search(const CoordPoint& point, kdtree_node* node, double r, std::vector* range_result); + bool bounds_overlap_ball(const CoordPoint& point, double dist, + kdtree_node* node); + bool ball_within_bounds(const CoordPoint& point, double dist, + kdtree_node* node); + // class implementing the distance computation + DistanceMeasure* distance; + // search predicate in knn searches + KdNodePredicate* searchpredicate; + + public: + KdNodeVector allnodes; + size_t dimension; + kdtree_node* root; + // distance_type can be 0 (max), 1 (city block), or 2 (euklid) + KdTree(const KdNodeVector* nodes, int distance_type = 2); + ~KdTree(); + void set_distance(int distance_type, const DoubleVector* weights = NULL); + void k_nearest_neighbors(const CoordPoint& point, size_t k, + KdNodeVector* result, + std::vector* distances, + KdNodePredicate* pred = NULL); + void range_nearest_neighbors(const CoordPoint& point, double r, + KdNodeVector* result); +}; + +} // end namespace Kdtree + +#endif \ No newline at end of file diff --git a/src/spyral_utils/tripclust/src/main.cpp b/src/spyral_utils/tripclust/src/main.cpp new file mode 100644 index 0000000..fcaad6d --- /dev/null +++ b/src/spyral_utils/tripclust/src/main.cpp @@ -0,0 +1,187 @@ +// +// main.cpp +// Main file for reference implementation of the TriplClust algorithm. 
+// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-16 +// License: see ../LICENSE +// + +#include +#include +#include +#include +#include + +#include "cluster.h" +#include "dnn.h" +#include "graph.h" +#include "option.h" +#include "output.h" +#include "pointcloud.h" + +// usage message +const char *usage = + "Usage:\n" + "\ttriplclust [options] \n" + "Options (defaults in brackets):\n" + "\t-r radius for point smoothing [2dNN]\n" + "\t (can be numeric or multiple of dNN)\n" + "\t-k number of neighbours in triplet creation [19]\n" + "\t-n number of the best triplets to use [2]\n" + "\t-a maximum value for the angle between the\n" + "\t triplet branches [0.03]\n" + "\t-s scalingfactor for clustering [0.33dNN]\n" + "\t (can be numeric or multiple of dNN)\n" + "\t-t best cluster distance [auto]\n" + "\t (can be numeric or 'auto')\n" + "\t-m minimum number of triplets for a cluster [5]\n" + "\t-dmax max gapwidth within a triplet [none]\n" + "\t (can be numeric, multiple of dNN or 'none')\n" + "\t-link linkage method for clustering [single]\n" + "\t (can be 'single', 'complete', 'average')\n" + "\t-ordered interpret infile as ordered\n" + "\t (i.e. 
points are in chronological order)\n" + "\t-oprefix \n" + "\t write result not to stdout, but to .csv\n" + "\t and (if -gnuplot is set) to .gnuplot\n" + "\t-gnuplot print result as a gnuplot command\n" + "\t-delim single char delimiter for csv input [' ']\n" + "\t-skip number of lines skipped at head of infile [0]\n" + "\t-v be verbose\n" + "\t-vv be more verbose and write debug trace files\n" + "Version:\n" + "\t1.4 from 2024-02-16"; + +int main(int argc, char **argv) { + // parse commandline + Opt opt_params; + if (opt_params.parse_args(argc, argv) != 0) { + std::cerr << usage << std::endl; + return 1; + } + const char *infile_name = opt_params.get_ifname(); + const char *outfile_prefix = opt_params.get_ofprefix(); + int opt_verbose = opt_params.get_verbosity(); + bool opt_ordered = opt_params.get_ordered(); + + // plausibility checks + if (!infile_name) { + std::cerr << "[Error] no infile given!\n" << usage << std::endl; + return 1; + } + + // load data + PointCloud cloud_xyz; + cloud_xyz.setOrdered(opt_ordered); + + try { + load_csv_file(infile_name, cloud_xyz, opt_params.get_delimiter(), + opt_params.get_skip()); + } catch (const std::invalid_argument &e) { + std::cerr << "[Error] in file'" << infile_name << "': " << e.what() + << std::endl; + return 2; + } +#ifdef WEBDEMO + // maximum pointcloud size error for webdemo + catch (const std::length_error &e) { + std::cerr << "[Error] in file'" << infile_name << "': " << e.what() + << std::endl; + return 3; + } +#endif + catch (const std::exception &e) { + std::cerr << "[Error] cannot read infile '" << infile_name << "'! 
" + << e.what() << std::endl; + return 2; + } + if (cloud_xyz.size() == 0) { + std::cerr << "[Error] empty cloud in file '" << infile_name << "'" + << std::endl + << "maybe you used the wrong delimiter" << std::endl; + return 2; + } + + // compute characteristic length dnn if needed + if (opt_params.needs_dnn()) { + double dnn = std::sqrt(first_quartile(cloud_xyz)); + if (opt_verbose > 0) { + std::cout << "[Info] computed dnn: " << dnn << std::endl; + } + opt_params.set_dnn(dnn); + if (dnn == 0.0) { + std::cerr << "[Error] dnn computed as zero. " + << "Suggestion: remove doublets, e.g. with 'sort -u'" + << std::endl; + return 3; + } + } + + // Step 1) smoothing by position averaging of neighboring points + PointCloud cloud_xyz_smooth; + smoothen_cloud(cloud_xyz, cloud_xyz_smooth, opt_params.get_r()); + + if (opt_verbose > 1) { + bool rc; + rc = cloud_to_csv(cloud_xyz_smooth); + if (!rc) + std::cerr << "[Error] can't write debug_smoothed.csv" << std::endl; + rc = debug_gnuplot(cloud_xyz, cloud_xyz_smooth); + if (!rc) + std::cerr << "[Error] can't write debug_smoothed.gnuplot" << std::endl; + } + + // Step 2) finding triplets of approximately collinear points + std::vector triplets; + generate_triplets(cloud_xyz_smooth, triplets, opt_params.get_k(), + opt_params.get_n(), opt_params.get_a()); + + // Step 3) single link hierarchical clustering of the triplets + cluster_group cl_group; + compute_hc(cloud_xyz_smooth, cl_group, triplets, opt_params.get_s(), + opt_params.get_t(), opt_params.is_tauto(), opt_params.get_dmax(), + opt_params.is_dmax(), opt_params.get_linkage(), opt_verbose); + + // Step 4) pruning by removal of small clusters ... + cleanup_cluster_group(cl_group, opt_params.get_m(), opt_verbose); + cluster_triplets_to_points(triplets, cl_group); + // .. 
and (optionally) by splitting up clusters at gaps > dmax + if (opt_params.is_dmax()) { + cluster_group cleaned_up_cluster_group; + for (cluster_group::iterator cl = cl_group.begin(); cl != cl_group.end(); + ++cl) { + max_step(cleaned_up_cluster_group, *cl, cloud_xyz, opt_params.get_dmax(), + opt_params.get_m() + 2); + } + cl_group = cleaned_up_cluster_group; + } + + // store cluster labels in points + add_clusters(cloud_xyz, cl_group, opt_params.is_gnuplot()); + + if (outfile_prefix) { + // redirect cout to outfile if requested + std::streambuf *backup = std::cout.rdbuf(); + std::ofstream of; + of.open((std::string(outfile_prefix) + ".csv").c_str()); + // replace the stream buffer from cout with the stream buffer from the + // opened file, so that everythin printed to cout is printed to the file. + std::cout.rdbuf(of.rdbuf()); + clusters_to_csv(cloud_xyz); + of.close(); + if (opt_params.is_gnuplot()) { + of.open((std::string(outfile_prefix) + ".gnuplot").c_str()); + clusters_to_gnuplot(cloud_xyz, cl_group); + of.close(); + } + // restore cout's default stream buffer + std::cout.rdbuf(backup); + } else if (opt_params.is_gnuplot()) { + clusters_to_gnuplot(cloud_xyz, cl_group); + } else { + clusters_to_csv(cloud_xyz); + } + + return 0; +} diff --git a/src/spyral_utils/tripclust/src/option.cpp b/src/spyral_utils/tripclust/src/option.cpp new file mode 100644 index 0000000..f97ac5f --- /dev/null +++ b/src/spyral_utils/tripclust/src/option.cpp @@ -0,0 +1,266 @@ +// +// option.cpp +// Class and functions for parsing and storing command line options. 
+// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-02 +// License: see ../LICENSE +// + +#include +#include +#include +#include + +#include "option.h" + +// initialize default values +Opt::Opt() { + this->infile_name = NULL; + this->outfile_prefix = NULL; + this->gnuplot = false; + this->delimiter = ' '; + this->skip = 0; + this->verbose = 0; + + // neighbourship smoothing + this->r = 2; + this->rdnn = true; + + // triplet building + this->k = 19; + this->n = 2; + this->a = 0.03; + + // triplet clustering + this->s = 0.3; + this->sdnn = true; + this->t = 0; + this->tauto = true; + this->dmax = 0.0; + this->isdmax = false; + this->dmax_dnn = false; + this->ordered = false; + this->link = SINGLE; + + this->m = 5; +} + +//------------------------------------------------------------------- +// parse commandline arguments. +// Parses the commandline arguments in *argv* and returns 0 if no error +// occurred. *argc* is the count off arguments. +//------------------------------------------------------------------- +int Opt::parse_args(int argc, char** argv) { + // parse command line + std::pair tmp; + try { + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-v")) { + if (this->verbose < 1) this->verbose = 1; + } else if (0 == strcmp(argv[i], "-vv")) { + if (this->verbose < 2) this->verbose = 2; + } else if (0 == strcmp(argv[i], "-s")) { + ++i; + if (i >= argc) { + return 1; + } + tmp = this->parse_argument(argv[i]); + this->s = tmp.first; + this->sdnn = tmp.second; + } else if (0 == strcmp(argv[i], "-r")) { + ++i; + if (i >= argc) { + return 1; + } + tmp = this->parse_argument(argv[i]); + this->r = tmp.first; + this->rdnn = tmp.second; + } else if (0 == strcmp(argv[i], "-k")) { + ++i; + if (i < argc) { + this->k = (int)stod(argv[i]); + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-ordered")) { + this->ordered = true; + } else if (0 == strcmp(argv[i], "-n")) { + ++i; + if (i < argc) { + this->n = (int)stod(argv[i]); + 
} else { + return 1; + } + } else if (0 == strcmp(argv[i], "-a")) { + ++i; + if (i < argc) { + this->a = stod(argv[i]); + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-t")) { + ++i; + if (i < argc) { + if (strcmp(argv[i], "auto") == 0 || + strcmp(argv[i], "automatic") == 0) { + this->tauto = true; + } else { + this->t = stod(argv[i]); + this->tauto = false; + } + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-m")) { + ++i; + if (i < argc) { + this->m = (int)stod(argv[i]); + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-delim")) { + ++i; + if (i < argc) { + if (strlen(argv[i]) > 1) { + std::cerr << "[Error] only a character as delimiter is allowed" + << std::endl; + return 1; + } + this->delimiter = *argv[i]; + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-dmax")) { + ++i; + if (i >= argc) { + return 1; + } + if (strcmp(argv[i], "none") == 0) { + this->isdmax = false; + } else { + tmp = this->parse_argument(argv[i]); + this->dmax = tmp.first; + this->dmax_dnn = tmp.second; + this->isdmax = true; + } + } else if (0 == strcmp(argv[i], "-link")) { + ++i; + if (i >= argc) { + return 1; + } + if (strcmp(argv[i], "single") == 0) { + this->link = SINGLE; + } else if (strcmp(argv[i], "complete") == 0) { + this->link = COMPLETE; + } else if (strcmp(argv[i], "average") == 0) { + this->link = AVERAGE; + } else { + std::cerr << "[Error] " << argv[i] << " is not a valide option!" + << std::endl; + return 1; + } + } else if (0 == strcmp(argv[i], "-skip")) { + ++i; + if (i < argc) { + int tmp = atoi(argv[i]); + if (tmp < 0) { + std::cerr << "[Error] skip takes only positive integers. parameter " + "is ignored!" 
+ << std::endl; + } else { + this->skip = (size_t)tmp; + } + } else { + return 1; + } + } else if (0 == strcmp(argv[i], "-oprefix")) { + if (i + 1 == argc) { + std::cerr << "[Error] not enough parameters" << std::endl; + return 1; + } else if (argv[i + 1][0] == '-') { + std::cerr << "[Error] please enter outfile name" << std::endl; + return 1; + } + this->outfile_prefix = argv[++i]; + } else if (0 == strcmp(argv[i], "-gnuplot")) { + this->gnuplot = true; + } else if (argv[i][0] == '-') { + return 1; + } else { + this->infile_name = argv[i]; + } + } + } catch (const std::invalid_argument &e) { + std::cerr << e.what() << std::endl; + return 1; + } + return 0; +} + +//------------------------------------------------------------------- +// parses the argument string *str*. +// the result is returned in *result*. *result_dnn* stores if *result* +// depends on dnn. If *str* is not a number a invalid_argument exception +// is thrown. +//------------------------------------------------------------------- +std::pair Opt::parse_argument(const char* str) { + double result = 0.0; + bool dnn = false; + char buff[4]; + size_t count = sscanf(str, "%lf%3s", &result, buff); + if (count == 2) { + if (std::strcmp("dnn", buff) && std::strcmp("dNN", buff)) + throw std::invalid_argument("not a number"); + dnn = true; + } else if (count == 0) { + throw std::invalid_argument("not a number"); + } + return std::pair(result, dnn); +} + +//------------------------------------------------------------------- +// compute attributes which depend on dnn. +// If r,s,dmax depend on dnn their new value will be computed. 
+//------------------------------------------------------------------- +void Opt::set_dnn(double dnn) { + if (this->rdnn) { + this->r *= dnn; + if (this->verbose > 0) { + std::cout << "[Info] computed smoothed radius: " << this->r << std::endl; + } + } + if (this->sdnn) { + this->s *= dnn; + if (this->verbose > 0) { + std::cout << "[Info] computed distance scale: " << this->s << std::endl; + } + } + if (this->dmax_dnn) { + this->dmax *= dnn; + if (this->verbose > 0) { + std::cout << "[Info] computed max gap: " << this->dmax << std::endl; + } + } +} + +// read access functions +const char* Opt::get_ifname() { return this->infile_name; } +const char* Opt::get_ofprefix() { return this->outfile_prefix; } +bool Opt::needs_dnn() { return this->rdnn || this->sdnn || this->dmax_dnn; } +bool Opt::is_gnuplot() { return this->gnuplot; } +size_t Opt::get_skip() { return this->skip; } +char Opt::get_delimiter() { return this->delimiter; } +int Opt::get_verbosity() { return this->verbose; } +double Opt::get_r() { return this->r; } +size_t Opt::get_k() { return this->k; } +size_t Opt::get_n() { return this->n; } +double Opt::get_a() { return this->a; } +double Opt::get_s() { return this->s; } +bool Opt::is_tauto() { return this->tauto; } +double Opt::get_t() { return this->t; } +bool Opt::is_dmax() { return this->isdmax; } +double Opt::get_dmax() { return this->dmax; } +Linkage Opt::get_linkage() { return this->link; } +size_t Opt::get_m() { return this->m; } +bool Opt::get_ordered() {return this->ordered;} diff --git a/src/spyral_utils/tripclust/src/option.h b/src/spyral_utils/tripclust/src/option.h new file mode 100644 index 0000000..15dee80 --- /dev/null +++ b/src/spyral_utils/tripclust/src/option.h @@ -0,0 +1,89 @@ +// +// option.h +// Class and functions for parsing and storing command line options. 
+// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-02 +// License: see ../LICENSE +// + +#ifndef OPTION_H +#define OPTION_H +#include +#include + +#include "util.h" + +// class for handling all command line option +class Opt { + private: + char *infile_name, *outfile_prefix; + // output as gnuplot + bool gnuplot; + // csv file delimiter + char delimiter; + // csv file with header + size_t skip; + // verbosity level + int verbose; +public: + // neighbour distance for smoothing + double r; + bool rdnn; // compute r with dnn + + // tested neighbours of triplet mid point + size_t k; + // max number of triplets to one mid point + size_t n; + // 1 - cos alpha, where alpha is the angle between the two triplet branches + double a; + + // distance scale factor in metric + double s; + bool sdnn; // compute s with dnn + // threshold for cdist in clustering + double t; + bool tauto; // auto generate t + // maximum gap width + double dmax; + bool isdmax; // dmax != none + bool dmax_dnn; // use dnn for dmax + bool ordered; // points are in chronological order + // linkage method for clustering + Linkage link; + + // min number of triplets per cluster + size_t m; + + std::pair parse_argument(const char *str); + + public: + Opt(); + int parse_args(int argc, char **argv); + // compute attributes which depend on dnn. + void set_dnn(double dnn); + + // read access functions + const char *get_ifname(); + // get outfile name + const char *get_ofprefix(); + bool needs_dnn(); + bool is_gnuplot(); + char get_delimiter(); + size_t get_skip(); + int get_verbosity(); + double get_r(); + size_t get_k(); + size_t get_n(); + double get_a(); + double get_s(); + bool is_tauto(); + double get_t(); + bool is_dmax(); + double get_dmax(); + bool get_ordered(); //! 
+ Linkage get_linkage(); + size_t get_m(); +}; + +#endif diff --git a/src/spyral_utils/tripclust/src/orthogonallsq.cpp b/src/spyral_utils/tripclust/src/orthogonallsq.cpp new file mode 100644 index 0000000..dc6ed39 --- /dev/null +++ b/src/spyral_utils/tripclust/src/orthogonallsq.cpp @@ -0,0 +1,76 @@ +// +// orthogonallsq.h +// Orthogonal Least Squares Fit by means of PCA. +// +// Author: Christoph Dalitz +// Date: 2022-11-07 +// License: see ../LICENSE +// +/** @file */ + +#include "pointcloud.h" + +#include +using Eigen::Vector3f; +using Eigen::MatrixXf; + + +// +// orthogonal least squares fit with libeigen +// pc: points +// a, b: line representation as a + t*b +// RC: largest eigenvalue +// +double orthogonal_lsq(const PointCloud &pc, Point* a, Point* b){ + double rc = 0.0; + + if (pc.size() == 0) + return rc; + + // anchor point is mean value + a->x = a->y = a->z = 0.0; + for (size_t i=0; i < pc.size(); i++) { + *a = *a + pc[i]; + } + *a = *a / pc.size(); + + // copy points to libeigen matrix + Eigen::MatrixXf points = Eigen::MatrixXf::Constant(pc.size(), 3, 0); + for (unsigned int i = 0; i < points.rows(); i++) { + points(i,0) = pc[i].x; + points(i,1) = pc[i].y; + points(i,2) = pc[i].z; + } + + // compute scatter matrix ... + MatrixXf centered = points.rowwise() - points.colwise().mean(); + MatrixXf scatter = (centered.adjoint() * centered); + + // ... 
and its eigenvalues and eigenvectors + Eigen::SelfAdjointEigenSolver eig(scatter); + Eigen::MatrixXf eigvecs = eig.eigenvectors(); + + // we need eigenvector to largest eigenvalue + // libeigen yields it as LAST column + b->x = eigvecs(0,2); b->y = eigvecs(1,2); b->z = eigvecs(2,2); + rc = eig.eigenvalues()(2); + + return (rc); +} + + +double orthogonal_lsq_distance(Point* a, Point* b, Point * c) { + double distance = 0; + double lambda; + + // copy points to libeigen vektor + Eigen::Vector3f a_vec = Eigen::Vector3f(a->x,a->y,a->z); + Eigen::Vector3f b_vec = Eigen::Vector3f(b->x,b->y,b->z); + Eigen::Vector3f c_vec = Eigen::Vector3f(c->x,c->y,c->z); + + lambda = b_vec.transpose() * (c_vec - a_vec); + + distance = (c_vec - (a_vec + lambda * b_vec)).norm(); + + return distance; +} \ No newline at end of file diff --git a/src/spyral_utils/tripclust/src/orthogonallsq.h b/src/spyral_utils/tripclust/src/orthogonallsq.h new file mode 100644 index 0000000..8d16081 --- /dev/null +++ b/src/spyral_utils/tripclust/src/orthogonallsq.h @@ -0,0 +1,31 @@ +// +// orthogonallsq.h +// Orthogonal Least Squares Fit by means of PCA. 
+// +// Author: Christoph Dalitz +// Date: 2022-11-07 +// License: see ../LICENSE +// +/** @file */ + +#ifndef ORTHOGONALLSQ_H +#define ORTHOGONALLSQ_H + +#include "pointcloud.h" + +// +// orthogonal least squares fit with libeigen +// pc: points +// a, b: line representation as a + t*b +// RC: largest eigenvalue +// +double orthogonal_lsq(const PointCloud &pc, Point* a, Point* b); + +// +// orthogonal distance of Point c to line a + t*b +// a, b: line representation as a + t*b +// c: point +// return: double distance +double orthogonal_lsq_distance(Point* a, Point* b, Point * c); + +#endif diff --git a/src/spyral_utils/tripclust/src/output.cpp b/src/spyral_utils/tripclust/src/output.cpp new file mode 100644 index 0000000..aecc3b4 --- /dev/null +++ b/src/spyral_utils/tripclust/src/output.cpp @@ -0,0 +1,288 @@ +// +// output.cpp +// Functions for writing output files. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#include +#include +#include +#include +#include +#include + +#include "output.h" + +//------------------------------------------------------------------- +// Computation of cluster colour +// The colour is computed from the index of the cluster and is returned +// as hex value. +//------------------------------------------------------------------- +unsigned long compute_cluster_colour(size_t cluster_index) { + const double r = (double)((cluster_index * 23) % 19) / 18.0; + const double g = (double)((cluster_index * 23) % 7) / 6.0; + const double b = (double)((cluster_index * 23) % 3) / 2.0; + uint8_t r2 = (uint8_t)(r * 255); + uint8_t g2 = (uint8_t)(g * 255); + uint8_t b2 = (uint8_t)(b * 255); + unsigned long rgb_hex = (r2 << 16) | (g2 << 8) | b2; + + return rgb_hex; +} + +//------------------------------------------------------------------- +// Finds min and max Point of *cloud* +// the Points are returned in *min* and *max*. 
+//------------------------------------------------------------------- +void find_min_max_point(const PointCloud &cloud, Point &min, Point &max) { + min = max = cloud[0]; + for (std::vector::const_iterator p = cloud.begin(); p != cloud.end(); + ++p) { + if (min.x > p->x) { + min.x = p->x; + } else if (max.x < p->x) { + max.x = p->x; + } + if (min.y > p->y) { + min.y = p->y; + } else if (max.y < p->y) { + max.y = p->y; + } + if (min.z > p->z) { + min.z = p->z; + } else if (max.z < p->z) { + max.z = p->z; + } + } +} + +//------------------------------------------------------------------- +// saves smoothen cloud as gnuplot script. +// The script is saved as file 'debug_smoothed.gnuplot' in the current +// working directory. It contains the original PointCloud *cloud* in +// black and the smoothed PointCloud *cloud_smooth* in red. +// *fname* is the name of the new file. The function returns false if +// an error occurred. +//------------------------------------------------------------------- +bool debug_gnuplot(const PointCloud &cloud, const PointCloud &cloud_smooth, + const char *fname) { + // saves cloud in gnuplot file + std::ofstream of(fname); + of << std::fixed; // set float style + bool is2d = cloud.is2d(); + if (!of.is_open()) { + std::cerr << "[Error] could not save under '" << fname << std::endl; + return false; + } + // find ranges for each axis + Point min, max; + find_min_max_point(cloud, min, max); + // Write header + if (!is2d) { + // when max and min are the same, the script can't be ploted, so the range + // must be changed + if (max.x > min.x) { + of << "set xrange [" << min.x << ":" << max.x << "]\n"; + } else { + of << "set xrange [" << (min.x - 1.0) << ":" << (max.x + 1.0) << "]\n"; + } + if (max.y > min.y) { + of << "set yrange [" << min.y << ":" << max.y << "]\n"; + } else { + of << "set yrange [" << (min.y - 1.0) << ":" << (max.y + 1.0) << "]\n"; + } + if (max.z > min.z) { + of << "set zrange [" << min.z << ":" << max.z << "]\n "; + } else { + 
of << "set zrange [" << (min.z - 1.0) << ":" << (max.z + 1.0) << "]\n "; + } + of << "splot "; + } else { + // if we have 2D data we use plot instead of splot. For plot the header is + // not needed. + of << "plot "; + } + + of << "'-' with points lc 'black' title 'original', '-' with points lc " + "'red' title 'smoothed'\n"; + for (PointCloud::const_iterator it = cloud.begin(); it != cloud.end(); ++it) { + of << it->x << " " << it->y; + if (!is2d) of << " " << it->z; + of << std::endl; + } + of << "e\n"; + + for (PointCloud::const_iterator it = cloud_smooth.begin(); + it != cloud_smooth.end(); ++it) { + of << it->x << " " << it->y; + if (!is2d) of << " " << it->z; + of << std::endl; + } + of << "e\npause mouse keypress\n"; + + of.close(); + return true; +} + +//------------------------------------------------------------------- +// saves a PointCloud *cloud* as csv file. +// The file is saved with the name *fname* under the current working +// directory. The function returns false if an error occurred. +//------------------------------------------------------------------- +bool cloud_to_csv(const PointCloud &cloud, const char *fname) { + std::ofstream of(fname); + of << std::fixed; // set float style + bool is2d = cloud.is2d(); + if (!of.is_open()) { + std::cerr << "[Error] could not save under '" << fname << std::endl; + return false; + } + of << "# x,y,z" << std::endl; + for (PointCloud::const_iterator it = cloud.begin(); it != cloud.end(); ++it) { + of << it->x << "," << it->y; + if (!is2d) of << "," << it->z << std::endl; + } + of.close(); + return true; +} + +//------------------------------------------------------------------- +// prints gnuplot script to stdout. +// rgb colours are created with the function *compute_cluster_colour*. +// The points of *cloud* are printed with the corresponding cluster/colour. 
+//------------------------------------------------------------------- +void clusters_to_gnuplot(const PointCloud &cloud, + const std::vector &clusters) { + std::vector points = cloud; + std::ostringstream pointstream, header, noise, clstream; + std::string noiseheader = ""; + bool is2d = cloud.is2d(); + // set output format for floats + pointstream << std::fixed; + header << std::fixed; + noise << std::fixed; + // find ranges for each axis + Point min, max; + find_min_max_point(cloud, min, max); + // Write header + if (!is2d) { + // when max and min are the same, the script can't be ploted, so the range + // must be changed + if (max.x > min.x) { + header << "set xrange [" << min.x << ":" << max.x << "]\n"; + } else { + header << "set xrange [" << (min.x - 1.0) << ":" << (max.x + 1.0) + << "]\n"; + } + if (max.y > min.y) { + header << "set yrange [" << min.y << ":" << max.y << "]\n"; + } else { + header << "set yrange [" << (min.y - 1.0) << ":" << (max.y + 1.0) + << "]\n"; + } + + if (max.z > min.z) { + header << "set zrange [" << min.z << ":" << max.z << "]\nsplot "; + } else { + header << "set zrange [" << (min.z - 1.0) << ":" << (max.z + 1.0) + << "]\nsplot "; + } + } else { + // if we have 2D data we use plot instead of splot. For plot the header is + // not needed. 
+ header << "plot"; + } + + // iterate over clusters + for (size_t cluster_index = 0; cluster_index < clusters.size(); + ++cluster_index) { + const std::vector &point_indices = clusters[cluster_index]; + // if there are no points in the cluster, it is only contained in an overlap + // cluster + if (point_indices.size() == 0) continue; + // add cluster header + unsigned long rgb_hex = compute_cluster_colour(cluster_index); + clstream << " '-' with points lc '#" << std::hex << rgb_hex; + std::set cluster_ids = cloud[point_indices[0]].cluster_ids; + if (cluster_ids.size() > 1) { + clstream << "' title 'overlap "; + for (std::set::const_iterator clid = cluster_ids.begin(); + clid != cluster_ids.end(); ++clid) { + if (clid != cluster_ids.begin()) clstream << ";"; + clstream << *clid; + } + } else { + clstream << "' title 'curve " << *cluster_ids.begin(); + } + clstream << "',"; + + // add points to script + for (std::vector::const_iterator it = point_indices.begin(); + it != point_indices.end(); ++it) { + const Point &point = cloud[*it]; + + // remove current point from vector points + for (std::vector::iterator p = points.begin(); p != points.end(); + p++) { + if (*p == point) { + points.erase(p); + break; + } + } + pointstream << point.x << " " << point.y; + if (!is2d) pointstream << " " << point.z; + pointstream << std::endl; + } + pointstream << "e" << std::endl; + } + clstream << std::endl; + + // plot all points red which are not clustered + if (points.size() > 0) { + noiseheader = " '-' with points lc 'red' title 'noise',"; + for (std::vector::iterator it = points.begin(); it != points.end(); + ++it) { + noise << it->x << " " << it->y; + if (!is2d) noise << " " << it->z; + noise << std::endl; + } + noise << "e" << std::endl; + } + + std::cout << header.str() << noiseheader << clstream.str() << noise.str() + << pointstream.str() << "pause mouse keypress\n"; +} + +//------------------------------------------------------------------- +// saves the PointCloud *cloud* 
with clusters as csv file. +// The csv file has following form: +// x,y,z,clusterid +// or 2D: +// x,y,clusterid +//------------------------------------------------------------------- +void clusters_to_csv(const PointCloud &cloud) { + bool is2d = cloud.is2d(); + std::cout << std::fixed + << "# Comment: curveID -1 represents noise\n# x, y, z, curveID\n"; + + for (PointCloud::const_iterator it = cloud.begin(); it != cloud.end(); ++it) { + std::cout << it->x << "," << it->y << ","; + if (!is2d) std::cout << it->z << ","; + if (it->cluster_ids.empty()) { + // Noise + std::cout << "-1\n"; + } else { + for (std::set::const_iterator it2 = it->cluster_ids.begin(); + it2 != it->cluster_ids.end(); ++it2) { + if (it2 != it->cluster_ids.begin()) { + std::cout << ";"; + } + std::cout << *it2; + } + std::cout << std::endl; + } + } +} diff --git a/src/spyral_utils/tripclust/src/output.h b/src/spyral_utils/tripclust/src/output.h new file mode 100644 index 0000000..50359b9 --- /dev/null +++ b/src/spyral_utils/tripclust/src/output.h @@ -0,0 +1,27 @@ +// +// output.h +// Functions for writing output files. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef OUTPUT_H +#define OUTPUT_H +#include "cluster.h" +#include "pointcloud.h" + +// saves a PointCloud *cloud* as csv file. +bool cloud_to_csv(const PointCloud &cloud, + const char *fname = "debug_smoothed.csv"); +// saves smoothen cloud as gnuplot script. +bool debug_gnuplot(const PointCloud &cloud, const PointCloud &cloud_smooth, + const char *fname = "debug_smoothed.gnuplot"); +// prints gnuplot script to stdout. +void clusters_to_gnuplot(const PointCloud &cloud, + const std::vector &clusters); +// saves the PointCloud *cloud* with clusters *cluster* as csv file. 
+void clusters_to_csv(const PointCloud &cloud); + +#endif diff --git a/src/spyral_utils/tripclust/src/pointcloud.cpp b/src/spyral_utils/tripclust/src/pointcloud.cpp new file mode 100644 index 0000000..af458b1 --- /dev/null +++ b/src/spyral_utils/tripclust/src/pointcloud.cpp @@ -0,0 +1,287 @@ +// +// pointcloud.cpp +// Classes and functions for 3D points and clouds thereof. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-02 +// License: see ../LICENSE +// + +#include +#include +#include +#include +#include +#include + +#include "kdtree/kdtree.hpp" +#include "pointcloud.h" +#include "util.h" + +// a single 3D point +Point::Point(const std::vector &point) { + if (point.size() != 3) { + throw std::invalid_argument("Point::Point(): point must be of dimension 3"); + } + this->x = point[0]; + this->y = point[1]; + this->z = point[2]; +} + +Point::Point(const std::vector &point, + const std::set &cluster_ids) { + if (point.size() != 3) { + throw std::invalid_argument("Point::Point(): point must be of dimension 3"); + } + this->x = point[0]; + this->y = point[1]; + this->z = point[2]; + this->cluster_ids = cluster_ids; +} + +Point::Point(double x, double y, double z) { + this->x = x; + this->y = y; + this->z = z; +} + +Point::Point(double x, double y, double z, + const std::set &cluster_ids) { + this->x = x; + this->y = y; + this->z = z; + this->cluster_ids = cluster_ids; +} + +Point::Point(double x, double y, double z, size_t index) { + this->x = x; + this->y = y; + this->z = z; + this->index = index; // only used for chronological order +} + + +// representation of 3D point as std::vector. 
+std::vector Point::as_vector() const { + std::vector point(3); + point[0] = this->x; + point[1] = this->y; + point[2] = this->z; + return point; +} + +// Euclidean norm of the point +double Point::norm() const { return sqrt((x * x) + (y * y) + (z * z)); } + +// squared euclidean norm of the point +double Point::squared_norm() const { return (x * x) + (y * y) + (z * z); } + +Point &Point::operator=(const Point &other) { + x = other.x; + y = other.y; + z = other.z; + return *this; +} + +bool Point::operator==(const Point &p) const { + return (x == p.x && y == p.y && z == p.z); +} + +// formatted output of the point +std::ostream &operator<<(std::ostream &strm, const Point &p) { + return strm << p.x << " " << p.y << " " << p.z; +} + +// vector addition +Point Point::operator+(const Point &p) const { + Point ret; + ret.x = this->x + p.x; + ret.y = this->y + p.y; + ret.z = this->z + p.z; + return ret; +} + +// vector subtraction +Point Point::operator-(const Point &p) const { + Point ret; + ret.x = this->x - p.x; + ret.y = this->y - p.y; + ret.z = this->z - p.z; + return ret; +} + +// scalar product (dot product) +double Point::operator*(const Point &p) const { + return this->x * p.x + this->y * p.y + this->z * p.z; +} + +// scalar division +Point Point::operator/(double c) const { return Point(x / c, y / c, z / c); } + +// scalar multiplication +Point operator*(Point x, double c) { + Point v(c * x.x, c * x.y, c * x.z); + return v; +} +Point operator*(double c, Point x) { + Point v(c * x.x, c * x.y, c * x.z); + return v; +} + +PointCloud::PointCloud() { this->points2d = false; } + +void PointCloud::set2d(bool is2d) { this->points2d = is2d; } + +bool PointCloud::is2d() const { return this->points2d; } + + +void PointCloud::setOrdered(bool ordered) {this->ordered=ordered;} + +bool PointCloud::isOrdered() const { return this->ordered; } + + +// Split string *input* into substrings by *delimiter*. 
The result is +// returned in *result* +void split(const std::string &input, std::vector &result, + const char delimiter) { + std::stringstream ss(input); + std::string element; + + while (std::getline(ss, element, delimiter)) { + result.push_back(element); + } +} + +//------------------------------------------------------------------- +// Load csv file. +// The csv file is split by *delimiter* and saved in *cloud*. +// Lines starting with '#' are ignored. +// If there are more than 3 columns all other are ignored and +// if there are two columns the PointCloud is set to 2D. +// Throws invalid_argument exception in case of problems. +//------------------------------------------------------------------- +void load_csv_file(const char *fname, PointCloud &cloud, const char delimiter, + size_t skip) { + std::ifstream infile(fname); + std::string line; + std::vector items; + size_t count = 0, count2d = 0, skiped = 0, countpoints = 0; + //size_t countOrdered = 0; //! + if (infile.fail()) throw std::exception(); + for (size_t i = 0; i < skip; ++i) { + // skip the header + std::getline(infile, line, '\n'); + skiped++; + } + while (!infile.eof()) { +#ifdef WEBDEMO + if (countpoints > 1000) + throw std::length_error("Number of points limited to 1000 in demo mode"); +#endif + //std::getline(infile, line, '\n'); + std::getline(infile, line, '\n'); + count++; + + // skip comments and empty lines + if (line[0] == '#' || line.empty() || + line.find_first_not_of("\n\r\t ") == std::string::npos) + continue; + + countpoints++; + Point point; + split(line, items, delimiter); + if (items.size() < 2) { + std::ostringstream oss; + oss << "row " << count + skiped << ": " + << "To few columns!"; + throw std::invalid_argument(oss.str()); + } else if (items.size() == 2) { + items.push_back("0"); // z=0 for 2D data (size=2) + count2d++; + } + size_t column = 1; + try { + // create point + point.x = stod(items[0].c_str()); + column++; + point.y = stod(items[1].c_str()); + column++; + point.z 
= stod(items[2].c_str()); + point.index = countpoints-1; + cloud.push_back(point); + } catch (const std::invalid_argument &e) { + std::ostringstream oss; + oss << "row " << count + skiped << " column " << column << ": " + << e.what(); + throw std::invalid_argument(oss.str()); + } + + items.clear(); + } + + // check if the cloud is 2d or if a problem occurred + if (count2d && count2d != cloud.size()) { + throw std::invalid_argument("Mixed 2d and 3d points."); + } else if (count2d) { + cloud.set2d(true); + } +} + +//------------------------------------------------------------------- +// Smoothing of the PointCloud *cloud*. +// For every point the nearest neighbours in the radius *r* is searched +// and the centroid of this neighbours is computed. The result is +// returned in *result_cloud* and contains these centroids. The +// centroids are duplicated in the result cloud, so it has the same +// size and order as *cloud*. +//------------------------------------------------------------------- +void smoothen_cloud(const PointCloud &cloud, PointCloud &result_cloud, + double r) { + Kdtree::KdNodeVector nodes; + + // If the smooth-radius is zero return the unsmoothed pointcloud + if (r == 0) { + result_cloud = cloud; + return; + } + + // build kdtree + for (size_t i = 0; i < cloud.size(); ++i) { + nodes.push_back(cloud[i].as_vector()); + } + Kdtree::KdTree kdtree(&nodes); + + for (size_t i = 0; i < cloud.size(); ++i) { + size_t result_size; + Point new_point, point = cloud[i]; + Kdtree::KdNodeVector result; + + kdtree.range_nearest_neighbors(point.as_vector(), r, &result); + result_size = result.size(); + + // compute the centroid with mean + std::vector x_list; + std::vector y_list; + std::vector z_list; + + for (Kdtree::KdNodeVector::iterator it = result.begin(); it != result.end(); + ++it) { + x_list.push_back(it->point[0]); + y_list.push_back(it->point[1]); + z_list.push_back(it->point[2]); + } + + new_point.x = + std::accumulate(x_list.begin(), x_list.end(), 0.0) / 
result_size; + + new_point.y = + std::accumulate(y_list.begin(), y_list.end(), 0.0) / result_size; + + new_point.z = + std::accumulate(z_list.begin(), z_list.end(), 0.0) / result_size; + + new_point.index = point.index; + result_cloud.push_back(new_point); + } + result_cloud.setOrdered(cloud.isOrdered()); +} diff --git a/src/spyral_utils/tripclust/src/pointcloud.h b/src/spyral_utils/tripclust/src/pointcloud.h new file mode 100644 index 0000000..15bbb7f --- /dev/null +++ b/src/spyral_utils/tripclust/src/pointcloud.h @@ -0,0 +1,81 @@ +// +// pointcloud.h +// Classes and functions for 3D points and clouds thereof. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-02 +// License: see ../LICENSE +// + +#ifndef POINTCLOUD_H +#define POINTCLOUD_H +#include +#include +#include +#include +#include +#include + +// 3D point class. +class Point { + public: + double x; + double y; + double z; + std::set cluster_ids; + size_t index; // only used for chronological order + + Point(){}; + Point(const std::vector& point); + Point(const std::vector& point, const std::set& cluster_ids); + Point(double x, double y, double z); + Point(double x, double y, double z, const std::set& cluster_ids); + Point(double x, double y, double z, size_t index); + + // representation of 3D point as std::vector + std::vector as_vector() const; + // Euclidean norm + double norm() const; + // squared norm + double squared_norm() const; + + friend std::ostream& operator<<(std::ostream& os, const Point& p); + bool operator==(const Point& p) const; + Point& operator=(const Point& other); + // vector addition + Point operator+(const Point& p) const; + // vector subtraction + Point operator-(const Point& p) const; + // scalar product + double operator*(const Point& p) const; + // scalar division + Point operator/(double c) const; +}; + +// scalar multiplication +Point operator*(Point x, double c); +Point operator*(double c, Point x); + +// The Pointcloud is a vector of points +class 
PointCloud : public std::vector { + private: + bool points2d; + bool ordered; //! + + public: + bool is2d() const; + void set2d(bool is2d); + bool isOrdered() const; //! + void setOrdered(bool isOrdered); //! + PointCloud(); +}; + + +// Load csv file. +void load_csv_file(const char* fname, PointCloud& cloud, const char delimiter, + size_t skip = 0); +// Smoothing of the PointCloud *cloud*. The result is returned in *result_cloud* +void smoothen_cloud(const PointCloud& cloud, PointCloud& result_cloud, + double radius); + +#endif diff --git a/src/spyral_utils/tripclust/src/postprocess.cpp b/src/spyral_utils/tripclust/src/postprocess.cpp new file mode 100644 index 0000000..5375891 --- /dev/null +++ b/src/spyral_utils/tripclust/src/postprocess.cpp @@ -0,0 +1,392 @@ +// +// postprocess.cpp +// The actual TriplClust post-processing. +// +// Author: Christoph Dalitz +// Date: 2022-11-07 +// License: see ../LICENSE +// + +#include "postprocess.h" +#include "orthogonallsq.h" +#include "kdtree/kdtree.hpp" +#include "util.h" + +#include +#include +#include +#include +#include /* sqrt */ + + + + +void writeClusterID(const std::vector & correct, std::vector< std::vector > & assignedIds); +void writeIDsInCloud(PointCloud &cloud, const std::vector< std::vector > & assignedIds, size_t* next_label, size_t old_cluster_id); +int correctOverSegmentation(PointCloud &cloud, std::vector< std::vector > & assignedIds); +int expandStart(PointCloud &cloud, std::vector< std::vector > & assignedIds); + + +/** @file */ + + +/** + * @brief Processes entire point cloud and reassigns cluster id labels, if appropriate. 
+ * + * @param cloud Pointcloud generated by triplclust + * @param min_depth Minimum number of points making a branch + * @param verbose Verbosity level + * @return RC Number of split up clusters + */ +int process_pointcloud(PointCloud &cloud, int min_depth/*=25*/, int verbose/*=0*/) +{ + int nchanged = 0; + + // collect all cluster labels in cloud + std::set ids; + Point p; + //for(Point p : cloud) { + for (std::vector::iterator iterator = cloud.begin(); iterator != cloud.end(); ++iterator) { + p = *iterator; + p.cluster_ids = iterator->cluster_ids; + // concat cluster id set to ids + ids.insert(p.cluster_ids.begin(), p.cluster_ids.end()); + } + + size_t last_label_begin = *ids.rbegin(); + size_t next_label = *ids.rbegin()+1; + + PointCloud temp; + std::vector temp_ids; + + // for each cluster label, call process_cluster + size_t id; + for (std::set::iterator iterator = ids.begin(); iterator != ids.end(); ++iterator) { + id = *iterator; + temp_ids.clear(); + for (unsigned int index = 0; index < cloud.size(); ++index) { + // check if id is in Point in cloud + if(cloud[index].cluster_ids.find(id) != cloud[index].cluster_ids.end()) { + temp_ids.push_back(index); + } + } + // call process_cluster with the cloud subset with cluster=id + nchanged += process_cluster(cloud, temp_ids, id, &next_label, min_depth, verbose); + } + + nchanged = (next_label-1)-last_label_begin; + + return nchanged; + +} + + +// processes cluster with given id in point cloud +// and reassign cluster id labels, if appropriate +// cloud: *full* point cloud with label cluster_id +// indexes: the index of the point cloud elements in the cluster +// cluster_id: cluster id that is to be processed +// next_label: next free label for new ids +// min_depth: minimum number of points making a branch +// verbose: verbosity level +// RC: number of split up clusters (1 or 0) +int process_cluster(PointCloud &cloud, std::vector indexes, size_t cluster_id, size_t* next_label, size_t min_depth, int verbose) +{ + 
int nchanged = 0; + bool compound = false; // is trajectory made of many trajectories + + Graph graph(cloud, indexes); // Constructor constructs MST + + std::vector< std::vector > segments; // will contain all segments in a trajectory + + // remove edges at branches with depth min_depth + if(graph.removeBranches(min_depth)) { // is compound trajectory + compound = true; + // remove edges considerably longer than neighborhood edges + graph.removeLongEdges(); + } + + // if (verbose > 1): visualize MST with removed edges in different color + if(verbose>1) { + graph.generateGnuplot(true, "debug-cluster-" + std::to_string(cluster_id)); + } + + if(compound) { + segments = graph.assignNewCurveIds(); + + // correct oversegmentation with extrapolation by orthogonal_lsq + correctOverSegmentation(cloud, segments); + + // extrapolate curves to start (points belong to more than one curve?) + expandStart(cloud, segments); + + // adjust cluster id labels + writeIDsInCloud(cloud, segments, next_label, cluster_id); + + } + return nchanged; +} + + + + +/** + * @brief Corrects Oversegmentation after removing edges from the MST. + * + * For every Segment: + * + * Compute PCA for last 4th of the segment. Then, for every follower segment + * (meaning the last index (+tolerance) of the leader is greater than the first index of the follower), + * calculate the orthogonal distance from the first point to the previously calculated PCA. + * + * The leader segment is assigned the index of the follower with the minimum distance. + * If there are no followers the segment will not change. + * + * Note: The first index is the earliest in time. + * + * Note: If a segment only has followers with a distance greater than max_distance, the followers will be ignored. + * + * @param cloud Pointcloud from triplclust. + * @return assignedIds: Contains the segments and will be modified. + * @return int Number of Segments that were removed in this step. 
+ */ +int correctOverSegmentation(PointCloud &cloud, std::vector< std::vector > & assignedIds) { + if(assignedIds.size() <= 1) { + return 0; + } + int nchanged = assignedIds.size(); + int size_of_cluster=0; + PointCloud pc; + Point a; + Point b; + unsigned int max_cluster_id = assignedIds.size()-1; + size_t tolerance = 10; //10 + double min_lsq_distance=std::numeric_limits::max(); + double lsq_distance=0; + unsigned int min_lsq_distance_cluster_id=max_cluster_id+1; + std::vector result; + double max_distance = 10; + + + for(unsigned int cluster_id = 0; cluster_id <= max_cluster_id; cluster_id++) { + pc.clear(); + min_lsq_distance = std::numeric_limits::max(); + lsq_distance=0; + min_lsq_distance_cluster_id=max_cluster_id+1; + + size_of_cluster = assignedIds[cluster_id].size(); + + // get the last 4th of the points in the cluster and push onto pc + for(int i = (int)(3*size_of_cluster/4); i (assignedIds[i].front())) { + lsq_distance = orthogonal_lsq_distance(&a,&b,&cloud[assignedIds[i].front()]); + if((lsq_distance < max_distance) && (lsq_distance < min_lsq_distance)) { + min_lsq_distance = lsq_distance; + min_lsq_distance_cluster_id = i; + } + } + } + } + if(min_lsq_distance_cluster_id != (max_cluster_id+1)) nchanged++; + result.push_back(min_lsq_distance_cluster_id); + } + + writeClusterID(result, assignedIds); + + if((assignedIds.size() == 0) || ((assignedIds.size()-1) > max_cluster_id)) { + std::cout<<"Something has gone wrong while trying to correct oversegmentation"< > & assignedIds) { + + if(assignedIds.size() <= 1) { + return 0; + } + + int nchanged = 0; + int size_of_cluster=0; + PointCloud pc; + Point a; + Point a_middle; + Point b; + Point b0; + std::vector distances; + double mean = 0.0; + double standardDeviation = 0.0; + double upperBound = 0.0; // mean + 2*sd + double lowerBound = 0.0; // mean - 2*sd + double current_distance = 0.0; + + // For every cluster that isnt the first cluster: + for(unsigned int cluster_id = 1; cluster_id <= 
assignedIds.size()-1; cluster_id++) { + if(assignedIds[cluster_id].size() == 0) break; + pc.clear(); + size_of_cluster = assignedIds[cluster_id].size(); + // Add every Point in first 5th of cluster to pc. + for(int i = 0; i<(int)(size_of_cluster/5); i++) { + pc.push_back(cloud[assignedIds[cluster_id][i]]); + } + // calc PCA + orthogonal_lsq(pc, &a, &b); + + Point p; + for (std::vector::iterator iterator = pc.begin(); iterator != pc.end(); ++iterator) { + p = *iterator; + distances.push_back(orthogonal_lsq_distance(&a,&b,&p)); + } + + // avoid division by zero + if(distances.size() <= 1) { + std::cout<<"Empty distances\n"; + break; + } + + double d; + for (std::vector::iterator iterator = distances.begin(); iterator != distances.end(); ++iterator) { + d = *iterator; + mean += d; + } + + mean /= distances.size(); + + for (std::vector::iterator iterator = distances.begin(); iterator != distances.end(); ++iterator) { + d = *iterator; + standardDeviation += ((d - mean)*(d - mean)); + } + + standardDeviation *= 1/((double)distances.size()-1); + standardDeviation = sqrt(standardDeviation); + + upperBound = mean+(2*standardDeviation); + lowerBound = mean-(2*standardDeviation); + + + + for(unsigned int i = 0; i assignedIds[cluster_id][0]) { + current_distance = orthogonal_lsq_distance(&a, &b, &cloud[assignedIds[0][i]]); + if((current_distance <= upperBound) && (current_distance >= lowerBound) ) { + assignedIds[cluster_id].push_back(assignedIds[0][i]); + + //!!!!!! uncomment following lines to erase Indexes from Cluster 0 when they are assigned to another Cluster. + + //assignedIds[0].erase(assignedIds[0].begin() + i); + //i--; + //size_of_cluster0--; + nchanged++; + } + } + } + } + return nchanged; +} + + +/** + * @brief Adds oversegmented Clusters together and modifies assignedIds appropriately. Every index in correct corresponds to an original cluster and the content assigns this original cluster to another one (or none). 
+ * + * Helper function of correctOversegmentation(). Checks every element in param correct. If it is not + * assignedIds.size() then correctOversegmenation() has deemed it a part of another segment. + * The corresponding Vector (by index) in assignedIds is written to the index specified in correct. + * Finally, empty vectors in assignedIds are removed. + * Also keeps track of Cluster 0 and ensures that Cluster 0 stays in front of assignedIds. + * + * @param correct: + * @returns assignedIds + */ +void writeClusterID(const std::vector & correct, std::vector< std::vector > & assignedIds) { + + if(correct.size()>assignedIds.size()) { + // dont correct anything + std::cout<<"writeClusterID. Invalid input, Oversegmentation will not be corrected.\n"; + return; + } + + unsigned int zero_location=0; + + for(unsigned int i = 0; i assignedIds[i].front()) { + assignedIds[correct[i]].insert( assignedIds[correct[i]].end(), assignedIds[i].begin(), assignedIds[i].end() ); + } else { + assignedIds[correct[i]].insert( assignedIds[correct[i]].begin(), assignedIds[i].begin(), assignedIds[i].end() ); + } + assignedIds[i].clear(); + } + } + + if(zero_location < assignedIds.size()) { + std::vector< std::vector >::iterator it = assignedIds.begin() + zero_location; + std::rotate(assignedIds.begin(), it, it + 1); + } else { + std::cout<<"writeClusterID: zero_location impossible.\n"; + } + + // remove all items that are empty vectors = all vectors that were appended to another vector due to being oversegmented + std::vector< std::vector > help = assignedIds; + assignedIds.clear(); + for (size_t i=0; i > & assignedIds, size_t* next_label, size_t old_cluster_id) { + // skip cluster 0, so that cluster 0 keeps old cluster id + for(size_t cluster_id = 1; cluster_id cloud_indexes, size_t cluster_id, size_t* next_label, size_t min_depth, int verbose); + + +#endif diff --git a/src/spyral_utils/tripclust/src/triplet.cpp b/src/spyral_utils/tripclust/src/triplet.cpp new file mode 100644 index 
0000000..df40a27 --- /dev/null +++ b/src/spyral_utils/tripclust/src/triplet.cpp @@ -0,0 +1,141 @@ +// +// triplet.cpp +// Classes and functions for triplets of three points and +// computing their dissimilarity. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2024-02-02 +// License: see ../LICENSE +// + +#include +#include + +#include "kdtree/kdtree.hpp" +#include "triplet.h" + + +//------------------------------------------------------------------- +// Generates triplets from the PointCloud *cloud*. +// The resulting triplets are returned in *triplets*. *k* is the number +// of neighbores from a point, which are used for triplet generation. +// *n* is the number of the best triplet candidates to use. This can +// be lesser than *n*. *a* is the max error (1-angle) for the triplet +// to be a triplet candidate. If the cloud is ordered, only triplets +// with a.index < b.index < c.index are considered. +//------------------------------------------------------------------- +void generate_triplets(const PointCloud &cloud, std::vector &triplets, + size_t k, size_t n, double a) { + std::vector distances; + Kdtree::KdNodeVector nodes, result; + std::vector indices; // save the indices so that they can be used + // for the KdNode constructor + indices.resize(cloud.size(), 0); + + // build kdtree + for (size_t i = 0; i < cloud.size(); ++i) { + indices[i] = i; + Kdtree::KdNode n = Kdtree::KdNode(cloud[i].as_vector(), (void *)&indices[i]); + n.index = cloud[i].index; + nodes.push_back(n);//, NULL, (int)cloud[i].index); + } + Kdtree::KdTree kdtree(&nodes); + + for (size_t point_index_b = 0; point_index_b < cloud.size(); + ++point_index_b) { + distances.clear(); + Point point_b = cloud[point_index_b]; + std::vector triplet_candidates; + kdtree.k_nearest_neighbors(cloud[point_index_b].as_vector(), k, &result, + &distances); + + for (size_t result_index_a = 1; result_index_a < result.size(); + ++result_index_a) { + // When the distance is 0, we have the 
same point as point_b + if (distances[result_index_a] == 0) continue; + Point point_a(result[result_index_a].point); + point_a.index = result[result_index_a].index; + if (cloud.isOrdered() && (point_a.index > point_b.index)) continue; + size_t point_index_a = *(size_t *)result[result_index_a].data; + + Point direction_ab = point_b - point_a; + double ab_norm = direction_ab.norm(); + direction_ab = direction_ab / ab_norm; + + for (size_t result_index_c = result_index_a + 1; + result_index_c < result.size(); ++result_index_c) { + // When the distance is 0, we have the same point as point_b + if (distances[result_index_c] == 0) continue; + Point point_c = Point(result[result_index_c].point); + point_c.index = result[result_index_c].index; + if (cloud.isOrdered() && (point_b.index > point_c.index)) continue; + size_t point_index_c = *(size_t *)result[result_index_c].data; + + Point direction_bc = point_c - point_b; + double bc_norm = direction_bc.norm(); + direction_bc = direction_bc / bc_norm; + + const double angle = direction_ab * direction_bc; + + // calculate error + const double error = 1.0f - angle; + + if (error <= a) { + // calculate center + Point center = (point_a + point_b + point_c) / 3.0f; + + // calculate direction + Point direction = point_c - point_b; + direction = direction / direction.norm(); + + triplet new_triplet; + + new_triplet.point_index_a = point_index_a; + new_triplet.point_index_b = point_index_b; + new_triplet.point_index_c = point_index_c; + new_triplet.center = center; + new_triplet.direction = direction; + new_triplet.error = error; + + triplet_candidates.push_back(new_triplet); + } + } + } + + // order triplet candidates + std::sort(triplet_candidates.begin(), triplet_candidates.end()); + + // use the n best candidates + for (size_t i = 0; i < std::min(n, triplet_candidates.size()); ++i) { + triplets.push_back(triplet_candidates[i]); + } + } +} + +// initialization of scale factor for triplet dissimilarity 
+ScaleTripletMetric::ScaleTripletMetric(double s) { + this->scale = s; +} + + +// dissimilarity measure for triplets +double ScaleTripletMetric::operator()(const triplet &lhs, const triplet &rhs) { + const double perpendicularDistanceA = + (rhs.center - lhs.center + + lhs.direction * (lhs.center - rhs.center) * lhs.direction).squared_norm(); + const double perpendicularDistanceB = + (lhs.center - rhs.center + + rhs.direction * (rhs.center - lhs.center) * rhs.direction).squared_norm(); + + double anglecos = lhs.direction * rhs.direction; + if (anglecos > 1.0) anglecos = 1.0; + if (anglecos < -1.0) anglecos = -1.0; + if (std::fabs(anglecos) < 1.0e-8) { + return 1.0e+8; + } else { + return ( + std::sqrt(std::max(perpendicularDistanceA, perpendicularDistanceB)) / + this->scale + + std::fabs(std::tan(std::acos(anglecos))) ); + } +} diff --git a/src/spyral_utils/tripclust/src/triplet.h b/src/spyral_utils/tripclust/src/triplet.h new file mode 100644 index 0000000..758b684 --- /dev/null +++ b/src/spyral_utils/tripclust/src/triplet.h @@ -0,0 +1,46 @@ +// +// triplet.h +// Classes and functions for triplets of three points and +// computing their dissimilarity. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef TRIPLET_H +#define TRIPLET_H + +#include +#include + +#include "pointcloud.h" + +// triplet of three points +struct triplet { + size_t point_index_a; + size_t point_index_b; + size_t point_index_c; + Point center; + Point direction; + double error; + friend bool operator<(const triplet &t1, const triplet &t2) { + return (t1.error < t2.error); + }; +}; + +// dissimilarity for triplets. +// scale is an external scale factor. 
+class ScaleTripletMetric { + private: + double scale; + + public: + ScaleTripletMetric(double s); + double operator()(const triplet &lhs, const triplet &rhs); +}; + +// generates triplets from PointCloud +void generate_triplets(const PointCloud &cloud, std::vector &triplets, + size_t k, size_t n, double a); +#endif diff --git a/src/spyral_utils/tripclust/src/util.cpp b/src/spyral_utils/tripclust/src/util.cpp new file mode 100644 index 0000000..5bba8f0 --- /dev/null +++ b/src/spyral_utils/tripclust/src/util.cpp @@ -0,0 +1,34 @@ +// +// util.cpp +// Utility functions needed here and there. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2019-04-02 +// License: see ../LICENSE +// + +#include "util.h" +#include +#include + +//------------------------------------------------------------------- +// converts *str* to double. +// If str is not a number a invalid_argument exception is thrown. +//------------------------------------------------------------------- +double stod(const char* s) { + double result; + + // remove leading and trailing white space + std::string str(s); + str.erase(0, str.find_first_not_of("\t\r\n ")); + str.erase(str.find_last_not_of("\t\r\n ") + 1); + + // use istream for conversion + std::istringstream iss(str); + + iss >> std::ws >> result; + if (iss.fail() || !iss.eof()) { + throw std::invalid_argument("not a number"); + } + return result; +} diff --git a/src/spyral_utils/tripclust/src/util.h b/src/spyral_utils/tripclust/src/util.h new file mode 100644 index 0000000..39f8652 --- /dev/null +++ b/src/spyral_utils/tripclust/src/util.h @@ -0,0 +1,18 @@ +// +// util.h +// Utility functions needed here and there. +// +// Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz +// Date: 2018-08-30 +// License: see ../LICENSE +// + +#ifndef UTIL_H +#define UTIL_H + +enum Linkage { SINGLE, COMPLETE, AVERAGE }; + +// converts *str* to double. +double stod(const char* str); + +#endif