From 4bfd7ce5164fd8671234bb821696b64a4dc464d8 Mon Sep 17 00:00:00 2001
From: "Daniel N. Baker" <dnb@cs.jhu.edu>
Date: Wed, 8 Dec 2021 08:30:26 -0500
Subject: [PATCH 1/2] Switch from C-Python API to Pybind11 to use one version
 for all Python versions

---
 .gitmodules      |   3 ++
 fht-pybind11.cpp | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 pybind11         |   1 +
 setup.py         |  12 ++---
 4 files changed, 123 insertions(+), 9 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 fht-pybind11.cpp
 create mode 160000 pybind11
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..2726187
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "pybind11"]
+	path = pybind11
+	url = https://github.com/pybind/pybind11
diff --git a/fht-pybind11.cpp b/fht-pybind11.cpp
new file mode 100644
index 0000000..3380b53
--- /dev/null
+++ b/fht-pybind11.cpp
@@ -0,0 +1,116 @@
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
+#include "fht.h"
+
+namespace py = pybind11;
+
+static constexpr inline unsigned ilog2(size_t x) noexcept {
+    return __builtin_ctz(x);
+}
+
+template<typename T, int flags>
+py::array_t<T> npfht_oop(py::array_t<T, flags> input) {
+    auto bi = input.request();
+    if(bi.ndim != 1) throw std::invalid_argument("input must be 1-d array");
+    const size_t n = bi.size;
+    if(!n || (n & (n - 1)) != 0u) {
+        throw std::invalid_argument("Input must be power-of-2 sized");
+    }
+    if(sizeof(T) != bi.itemsize) {
+        std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n");
+        throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type).");
+    }
+    py::array_t<T> ret(n);
+    py::buffer_info rbi = ret.request();
+    if(::fht((T *)bi.ptr, (T *)rbi.ptr, ilog2(n))) {
+        throw std::runtime_error("FHT did not work properly");
+    }
+    return ret;
+}
+
+template<typename T, int flags>
+py::array_t<T, flags> npfht_ip(py::array_t<T, flags> input) {
+    auto bi = input.request();
+    T *ptr = (T *)bi.ptr;
+    if(bi.ndim != 1) throw std::invalid_argument("input must be 1-d array");
+    const size_t n = bi.size;
+    if(!n || (n & (n - 1)) != 0u) {
+        throw std::invalid_argument("Input must be power-of-2 sized");
+    }
+    if(sizeof(T) != bi.itemsize) {
+        std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n");
+        throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type).");
+    }
+    if(::fht(ptr, ilog2(n))) {
+        throw std::runtime_error("FHT did not work properly");
+    }
+    return input;
+}
+
+#define UNUSED(x) (void)(x)
+py::array npfht(py::array input) {
+    py::buffer_info bi = input.request();
+    if(bi.format.front() == 'f') {
+        py::array_t<float, py::array::forcecast> farr(input);
+        return npfht_oop(farr);
+    } else {
+        py::array_t<double, py::array::forcecast> farr(input);
+        return npfht_oop(farr);
+    }
+}
+py::array npfht_(py::array input) {
+    py::buffer_info bi = input.request();
+    if(bi.format.front() == 'f') {
+        py::array_t<float, py::array::forcecast> farr(input);
+        return npfht_ip(farr);
+    } else {
+        py::array_t<double, py::array::forcecast> farr(input);
+        return npfht_ip(farr);
+    }
+}
+
+
+
+
+static char module_docstring[] =
+    "A C extension that computes the Fast Hadamard Transform";
+static char fht_docstring[] =
+    "Compute the Fast Hadamard Transform (FHT) for a given "
+    "one-dimensional NumPy array.\n\n"
+    "The Hadamard Transform is a linear orthogonal map defined on real vectors "
+    "whose length is a _power of two_. For the precise definition, see the "
+    "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The "
+    "Hadamard Transform has been recently used a lot in various machine "
+    "learning "
+    "and numerical algorithms.\n\n"
+    "The implementation uses "
+    "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) "
+    "to speed up the computation. If AVX is not supported on your machine, "
+    "a simpler implementation without (explicit) vectorization is used.\n\n"
+    "The function takes two parameters:\n\n"
+    "* `buffer` is a NumPy array which is being transformed. It must be a "
+    "one-dimensional array with `dtype` equal to `float32` or `float64` (the "
+    "former is recommended unless you need high accuracy) and of size being a "
+    "power "
+    "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 "
+    "bytes. "
+    "To allocate such an aligned buffer, use the function `created_aligned` "
+    "from this "
+    "module.\n"
+    "* `chunk` is a positive integer that controls when the implementation "
+    "switches "
+    "from recursive to iterative algorithm. The overall algorithm is "
+    "recursive, but as "
+    "soon as the vector becomes no longer than `chunk`, the iterative "
+    "algorithm is "
+    "invoked. For technical reasons, `chunk` must be at least 8. A good choice "
+    "is to "
+    "set `chunk` to 1024. But to fine-tune the performance one should use a "
+    "program "
+    "`best_chunk` supplied with the library.\n";
+
+PYBIND11_MODULE(ffht, m) {
+    m.doc() = module_docstring;
+    m.def("fht", npfht, py::arg("input"), fht_docstring);
+    m.def("fht_", npfht_, py::arg("input"), "In-place version of ffht.fht. See ffht.fht documentation for details");
+}
diff --git a/pybind11 b/pybind11
new file mode 160000
index 0000000..59aa998
--- /dev/null
+++ b/pybind11
@@ -0,0 +1 @@
+Subproject commit 59aa99860c60bd171b9565e9920f125fdb749267
diff --git a/setup.py b/setup.py
index f4841cb..9b9058a 100644
--- a/setup.py
+++ b/setup.py
@@ -18,19 +18,13 @@
     sys.stderr.write('NumPy not found!\n')
     raise
 
-if sys.version_info[0] == 2:
-    arr_sources = ['_ffht_2.c', 'fht.c']
-
-if sys.version_info[0] == 3:
-    arr_sources = ['_ffht_3.c', 'fht.c']
-
 module = Extension('ffht',
-                   sources= arr_sources,
+                   sources=["fht-pybind11.cpp", "fht.c", "fast_copy.c"],
                    extra_compile_args=['-march=native', '-O3', '-Wall', '-Wextra', '-pedantic',
                                        '-Wshadow', '-Wpointer-arith', '-Wcast-qual',
                                        '-Wstrict-prototypes', '-Wmissing-prototypes',
-                                       '-std=c99', '-DFHT_HEADER_ONLY'],
-                   include_dirs=[np.get_include()])
+                                       '-std=c++11'],
+                   include_dirs=[np.get_include()] + ["pybind11/include"])
 
 setup(name='FFHT',
       version='1.1',

From 23738856026c2869109250ec7667002fa1d8810b Mon Sep 17 00:00:00 2001
From: "Daniel N. Baker" <dnb@cs.jhu.edu>
Date: Wed, 8 Dec 2021 11:57:21 -0500
Subject: [PATCH 2/2] Add 2D support to FFHT, along with parallelization across
 rows. Adds OpenMP compilation requirement.

---
 fht-pybind11.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++++----
 setup.py         |  2 +-
 2 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/fht-pybind11.cpp b/fht-pybind11.cpp
index 3380b53..8b93100 100644
--- a/fht-pybind11.cpp
+++ b/fht-pybind11.cpp
@@ -28,6 +28,60 @@ py::array_t<T> npfht_oop(py::array_t<T, flags> input) {
     return ret;
 }
 
+template<typename T, int flags>
+py::array_t<T> npfht2d_oop(py::array_t<T, flags> input, int nthreads=1) {
+    auto bi = input.request();
+    if(bi.ndim != 2) throw std::invalid_argument("input must be 2-d array");
+    const py::ssize_t n = bi.shape[1];
+    if(!n || (n & (n - 1)) != 0u) {
+        throw std::invalid_argument("Input must be power-of-2 dimension");
+    }
+    const py::ssize_t nrows = bi.shape[0];
+    if(sizeof(T) != bi.itemsize) {
+        std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n");
+        throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type).");
+    }
+    py::array_t<T> ret(std::vector<py::ssize_t>{nrows, n});
+    py::buffer_info rbi = ret.request();
+    std::vector<int8_t> rvs(nrows);
+    const int il2 = ilog2(n);
+    #pragma omp parallel for num_threads(nthreads)
+    for(size_t i = 0; i < nrows; ++i) {
+        rvs[i] = ::fht((T *)bi.ptr + i * n, (T *)rbi.ptr + i * n, il2);
+    }
+    if(std::any_of(rvs.begin(), rvs.end(), [](int8_t x) {return x != 0;})) {
+        throw std::runtime_error("FHT did not work properly");
+    }
+    return ret;
+}
+
+template<typename T, int flags>
+py::array_t<T> npfht2d_ip(py::array_t<T, flags> input, int nthreads=1) {
+    auto bi = input.request();
+    if(bi.ndim != 2) throw std::invalid_argument("input must be 2-d array");
+    const py::ssize_t n = bi.shape[1];
+    if(!n || (n & (n - 1)) != 0u) {
+        throw std::invalid_argument("Input must be power-of-2 dimension");
+    }
+    const py::ssize_t nrows = bi.shape[0];
+    if(sizeof(T) != bi.itemsize) {
+        std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n");
+        throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type).");
+    }
+    std::vector<int> rvs(nrows);
+    const int il2 = ilog2(n);
+    #pragma omp parallel for num_threads(nthreads)
+    for(size_t i = 0; i < nrows; ++i) {
+        rvs[i] = ::fht((T *)bi.ptr + i * n, (T *)bi.ptr + i * n, il2);
+    }
+    if(std::any_of(rvs.begin(), rvs.end(), [](int8_t x) {return x != 0;})) {
+        throw std::runtime_error("FHT did not work properly");
+    }
+    return input;
+}
+
+
+
 template<typename T, int flags>
 py::array_t<T, flags> npfht_ip(py::array_t<T, flags> input) {
     auto bi = input.request();
@@ -47,24 +101,33 @@ py::array_t<T, flags> npfht_ip(py::array_t<T, flags> input) {
     return input;
 }
 
-#define UNUSED(x) (void)(x)
-py::array npfht(py::array input) {
+py::array npfht(py::array input, py::ssize_t nthreads=1) {
     py::buffer_info bi = input.request();
+    if(bi.ndim > 2) throw std::runtime_error("ndim > 2. Only accepted: 1d and 2d numpy arrays");
     if(bi.format.front() == 'f') {
         py::array_t<float, py::array::forcecast> farr(input);
+        if(bi.ndim == 2)
+            return npfht2d_oop(farr, nthreads);
         return npfht_oop(farr);
     } else {
         py::array_t<double, py::array::forcecast> farr(input);
+        if(bi.ndim == 2)
+            return npfht2d_oop(farr, nthreads);
         return npfht_oop(farr);
     }
 }
-py::array npfht_(py::array input) {
+
+py::array npfht_(py::array input, py::ssize_t nthreads=1) {
     py::buffer_info bi = input.request();
     if(bi.format.front() == 'f') {
         py::array_t<float, py::array::forcecast> farr(input);
+        if(bi.ndim == 2)
+            return npfht2d_ip(farr, nthreads);
         return npfht_ip(farr);
     } else {
         py::array_t<double, py::array::forcecast> farr(input);
+        if(bi.ndim == 2)
+            return npfht2d_ip(farr, nthreads);
         return npfht_ip(farr);
     }
 }
@@ -111,6 +174,6 @@ static char fht_docstring[] =
 
 PYBIND11_MODULE(ffht, m) {
     m.doc() = module_docstring;
-    m.def("fht", npfht, py::arg("input"), fht_docstring);
-    m.def("fht_", npfht_, py::arg("input"), "In-place version of ffht.fht. See ffht.fht documentation for details");
+    m.def("fht", npfht, py::arg("input"), py::arg("nthreads") = 1, fht_docstring);
+    m.def("fht_", npfht_, py::arg("input"), py::arg("nthreads") = 1, "In-place version of ffht.fht. See ffht.fht documentation for details");
 }
diff --git a/setup.py b/setup.py
index 9b9058a..b7c9043 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
                    extra_compile_args=['-march=native', '-O3', '-Wall', '-Wextra', '-pedantic',
                                        '-Wshadow', '-Wpointer-arith', '-Wcast-qual',
                                        '-Wstrict-prototypes', '-Wmissing-prototypes',
-                                       '-std=c++11'],
+                                       '-std=c++11', '-fopenmp'],
                    include_dirs=[np.get_include()] + ["pybind11/include"])
 
 setup(name='FFHT',