diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..2726187 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "pybind11"] + path = pybind11 + url = https://github.com/pybind/pybind11 diff --git a/fht-pybind11.cpp b/fht-pybind11.cpp new file mode 100644 index 0000000..8b93100 --- /dev/null +++ b/fht-pybind11.cpp @@ -0,0 +1,179 @@ +#include "pybind11/pybind11.h" +#include "pybind11/numpy.h" +#include "fht.h" + +namespace py = pybind11; + +static constexpr inline unsigned ilog2(size_t x) noexcept { + return __builtin_ctz(x); +} + +template +py::array_t npfht_oop(py::array_t input) { + auto bi = input.request(); + if(bi.ndim != 1) throw std::invalid_argument("input must be 1-d array"); + const size_t n = bi.size; + if(!n || (n & (n - 1)) != 0u) { + throw std::invalid_argument("Input must be power-of-2 sized"); + } + if(sizeof(T) != bi.itemsize) { + std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n"); + throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type)."); + } + py::array_t ret(n); + py::buffer_info rbi = ret.request(); + if(::fht((T *)bi.ptr, (T *)rbi.ptr, ilog2(n))) { + throw std::runtime_error("FHT did not work properly"); + } + return ret; +} + +template +py::array_t npfht2d_oop(py::array_t input, int nthreads=1) { + auto bi = input.request(); + if(bi.ndim != 2) throw std::invalid_argument("input must be 2-d array"); + const py::ssize_t n = bi.shape[1]; + if(!n || (n & (n - 1)) != 0u) { + throw std::invalid_argument("Input must be power-of-2 dimension"); + } + const py::ssize_t nrows = bi.shape[0]; + if(sizeof(T) != bi.itemsize) { + std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n"); + throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type)."); + } + py::array_t ret(std::vector{nrows, n}); + py::buffer_info rbi = ret.request(); + std::vector rvs(nrows); + const int il2 = ilog2(n); + #pragma omp parallel for num_threads(nthreads) + for(size_t i = 0; i < nrows; ++i) { + rvs[i] = ::fht((T *)bi.ptr + i * n, (T *)rbi.ptr + i * n, il2); + } + if(std::any_of(rvs.begin(), rvs.end(), [](int8_t x) {return x != 0;})) { + throw std::runtime_error("FHT did not work properly"); + } + return ret; +} + +template +py::array_t npfht2d_ip(py::array_t input, int nthreads=1) { + auto bi = input.request(); + if(bi.ndim != 2) throw std::invalid_argument("input must be 2-d array"); + const py::ssize_t n = bi.shape[1]; + if(!n || (n & (n - 1)) != 0u) { + throw std::invalid_argument("Input must be power-of-2 dimension"); + } + const py::ssize_t nrows = bi.shape[0]; + if(sizeof(T) != bi.itemsize) { + std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n"); + throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type)."); + } + std::vector rvs(nrows); + const int il2 = ilog2(n); + #pragma omp parallel for num_threads(nthreads) + for(size_t i = 0; i < nrows; ++i) { + rvs[i] = ::fht((T *)bi.ptr + i * n, (T *)bi.ptr + i * n, il2); + } + if(std::any_of(rvs.begin(), rvs.end(), [](int8_t x) {return x != 0;})) { + throw std::runtime_error("FHT did not work properly"); + } + return input; +} + + + +template +py::array_t npfht_ip(py::array_t input) { + auto bi = input.request(); + T *ptr = (T *)bi.ptr; + if(bi.ndim != 1) throw std::invalid_argument("input must be 1-d array"); + const size_t n = bi.size; + if(!n || (n & (n - 1)) != 0u) { + throw std::invalid_argument("Input must be power-of-2 sized"); + } + if(sizeof(T) != bi.itemsize) { + std::fprintf(stderr, "Error: Corrupted NumPy array (itemsize does not match data type).\n"); + throw std::runtime_error("Error: Corrupted NumPy array (itemsize does not match data type)."); + } + if(::fht(ptr, ilog2(n))) { + throw std::runtime_error("FHT did not work properly"); + } + return input; +} + +py::array npfht(py::array input, py::ssize_t nthreads=1) { + py::buffer_info bi = input.request(); + if(bi.ndim > 2) throw std::runtime_error("ndim > 2. Only accepted: 1d and 2d numpy arrays"); + if(bi.format.front() == 'f') { + py::array_t farr(input); + if(bi.ndim == 2) + return npfht2d_oop(farr, nthreads); + return npfht_oop(farr); + } else { + py::array_t farr(input); + if(bi.ndim == 2) + return npfht2d_oop(farr, nthreads); + return npfht_oop(farr); + } +} + +py::array npfht_(py::array input, py::ssize_t nthreads=1) { + py::buffer_info bi = input.request(); + if(bi.format.front() == 'f') { + py::array_t farr(input); + if(bi.ndim == 2) + return npfht2d_ip(farr, nthreads); + return npfht_ip(farr); + } else { + py::array_t farr(input); + if(bi.ndim == 2) + return npfht2d_ip(farr, nthreads); + return npfht_ip(farr); + } +} + + + + +static char module_docstring[] = + "A C extension that computes the Fast Hadamard Transform"; +static char fht_docstring[] = + "Compute the Fast Hadamard Transform (FHT) for a given " + "one-dimensional NumPy array.\n\n" + "The Hadamard Transform is a linear orthogonal map defined on real vectors " + "whose length is a _power of two_. For the precise definition, see the " + "[Wikipedia entry](https://en.wikipedia.org/wiki/Hadamard_transform). The " + "Hadamard Transform has been recently used a lot in various machine " + "learning " + "and numerical algorithms.\n\n" + "The implementation uses " + "[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) " + "to speed up the computation. If AVX is not supported on your machine, " + "a simpler implementation without (explicit) vectorization is used.\n\n" + "The function takes two parameters:\n\n" + "* `buffer` is a NumPy array which is being transformed. It must be a " + "one-dimensional array with `dtype` equal to `float32` or `float64` (the " + "former is recommended unless you need high accuracy) and of size being a " + "power " + "of two. If your CPU supports AVX, then `buffer` must be aligned to 32 " + "bytes. " + "To allocate such an aligned buffer, use the function `created_aligned` " + "from this " + "module.\n" + "* `chunk` is a positive integer that controls when the implementation " + "switches " + "from recursive to iterative algorithm. The overall algorithm is " + "recursive, but as " + "soon as the vector becomes no longer than `chunk`, the iterative " + "algorithm is " + "invoked. For technical reasons, `chunk` must be at least 8. A good choice " + "is to " + "set `chunk` to 1024. But to fine-tune the performance one should use a " + "program " + "`best_chunk` supplied with the library.\n"; + +PYBIND11_MODULE(ffht, m) { + m.doc() = module_docstring; + m.def("fht", npfht, py::arg("input"), py::arg("nthreads") = 1, fht_docstring); + m.def("fht_", npfht_, py::arg("input"), py::arg("nthreads") = 1, "In-place version of ffht.fht. See ffht.fht documentation for details"); +} diff --git a/pybind11 b/pybind11 new file mode 160000 index 0000000..59aa998 --- /dev/null +++ b/pybind11 @@ -0,0 +1 @@ +Subproject commit 59aa99860c60bd171b9565e9920f125fdb749267 diff --git a/setup.py b/setup.py index f4841cb..b7c9043 100644 --- a/setup.py +++ b/setup.py @@ -18,19 +18,13 @@ sys.stderr.write('NumPy not found!\n') raise -if sys.version_info[0] == 2: - arr_sources = ['_ffht_2.c', 'fht.c'] - -if sys.version_info[0] == 3: - arr_sources = ['_ffht_3.c', 'fht.c'] - module = Extension('ffht', - sources= arr_sources, + sources=["fht-pybind11.cpp", "fht.c", "fast_copy.c"], extra_compile_args=['-march=native', '-O3', '-Wall', '-Wextra', '-pedantic', '-Wshadow', '-Wpointer-arith', '-Wcast-qual', '-Wstrict-prototypes', '-Wmissing-prototypes', - '-std=c99', '-DFHT_HEADER_ONLY'], - include_dirs=[np.get_include()]) + '-std=c++11', '-fopenmp'], + include_dirs=[np.get_include()] + ["pybind11/include"]) setup(name='FFHT', version='1.1',