From ccd62f7b85cc05af167204a5f3dbc011dc65181f Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 2 Jul 2014 15:45:13 -0400 Subject: [PATCH 1/7] Implement reductions with optional skipna argument --- src/DataArrays.jl | 1 + src/operators.jl | 9 +-- src/reduce.jl | 196 ++++++++++++++++++++++++++++++++++++++++++++++ test/reduce.jl | 133 +++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 5 files changed, 332 insertions(+), 8 deletions(-) create mode 100644 src/reduce.jl create mode 100644 test/reduce.jl diff --git a/src/DataArrays.jl b/src/DataArrays.jl index c370ee7..f25ccb9 100644 --- a/src/DataArrays.jl +++ b/src/DataArrays.jl @@ -63,6 +63,7 @@ module DataArrays include("datamatrix.jl") include("linalg.jl") include("operators.jl") + include("reduce.jl") include("broadcast.jl") include("sort.jl") include("extras.jl") diff --git a/src/operators.jl b/src/operators.jl index c1c5efa..a67215d 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -146,14 +146,7 @@ const bit_operators = [:(Base.(:&)), :(Base.(:|)), :(Base.(:$))] -const unary_vector_operators = [:(Base.minimum), - :(Base.maximum), - :(Base.prod), - :(Base.sum), - :(Base.mean), - :(Base.median), - :(Base.std), - :(Base.var), +const unary_vector_operators = [:(Base.median), :(StatsBase.mad), :(Base.norm), :(StatsBase.skewness), diff --git a/src/reduce.jl b/src/reduce.jl new file mode 100644 index 0000000..4a21359 --- /dev/null +++ b/src/reduce.jl @@ -0,0 +1,196 @@ +using Base: evaluate + +## mapreduce implementation that skips NA + +function skipna_init(f, op, na::BitArray, data::Array, ifirst::Int, ilast::Int) + # Get first non-NA element + ifirst = Base.findnextnot(na, ifirst) + @inbounds d1 = data[ifirst] + + # Get next non-NA element + ifirst = Base.findnextnot(na, ifirst+1) + @inbounds d2 = data[ifirst] + + # Reduce first two elements + (evaluate(op, evaluate(f, d1), evaluate(f, d2)), ifirst) +end + +function mapreduce_seq_impl_skipna(f, op, T, A::DataArray, ifirst::Int, ilast::Int) + data = A.data + na = A.na + chunks = na.chunks + + v, i = skipna_init(f, op, na, data, ifirst, ilast) + + while i < ilast + i += 1 + @inbounds na = Base.unsafe_bitgetindex(chunks, i) + na && continue + @inbounds d = data[i] + v = evaluate(op, v, evaluate(f, d)) + end + v +end + +# Kernel that skips NA without branching +# Only used for cases where branching is more expensive than computing for NAs +function mapreduce_seq_impl_skipna(f::Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun), + op::Union(Base.AddFun, Base.MulFun, Base.AndFun, Base.OrFun), + T::Union(Type{Uint8}, Type{Uint16}, Type{Uint32}, Type{Uint64}, + Type{Int8}, Type{Int16}, Type{Int32}, Type{Int64}, + Type{Float32}, Type{Float64}), + A::DataArray, ifirst::Int, ilast::Int) + data = A.data + na = A.na + chunks = na.chunks + + v, i = skipna_init(f, op, na, data, ifirst, ilast) + + while i < ilast + i += 1 + @inbounds na = Base.unsafe_bitgetindex(chunks, i) + @inbounds d = data[i] + v = ifelse(na, v, evaluate(op, v, evaluate(f, d))) + end + v +end + +# Pairwise map-reduce +function mapreduce_pairwise_impl_skipna{T}(f, op, A::DataArray{T}, bytefirst::Int, bytelast::Int, n_notna::Int, blksize::Int) + if n_notna <= blksize + ifirst = 64*(bytefirst-1)+1 + ilast = min(64*bytelast, length(A)) + # Fall back to Base implementation if no NAs in block + return ilast - ifirst + 1 == n_notna ? Base.mapreduce_seq_impl(f, op, A.data, ifirst, ilast) : + mapreduce_seq_impl_skipna(f, op, T, A, ifirst, ilast) + end + + # Find byte in the middle of range + # The block size is restricted so that there will always be at + # least two non-NA elements in the returned range + chunks = A.na.chunks + nmid = 0 + imid = bytefirst-1 + while nmid < (n_notna >> 1) + imid += 1 + @inbounds nmid += count_zeros(chunks[imid]) + end + + v1 = mapreduce_pairwise_impl_skipna(f, op, A, bytefirst, imid, nmid, blksize) + v2 = mapreduce_pairwise_impl_skipna(f, op, A, imid+1, bytelast, n_notna-nmid, blksize) + evaluate(op, v1, v2) +end + +mapreduce_impl_skipna{T}(f, op, A::DataArray{T}) = + mapreduce_seq_impl_skipna(f, op, T, A, 1, length(A.data)) +mapreduce_impl_skipna(f, op::Base.AddFun, A::DataArray) = + mapreduce_pairwise_impl_skipna(f, op, A, 1, length(A.na.chunks), + length(A.na)-countnz(A.na), + max(128, Base.sum_pairwise_blocksize(f))) + +## general mapreduce interface + +function _mapreduce_skipna{T}(f, op, A::DataArray{T}) + n = length(A) + na = A.na + + nna = countnz(na) + nna == n && return Base.mr_empty(f, op, T) + nna == n-1 && return Base.r_promote(op, evaluate(f, A.data[Base.findnextnot(na, 1)])) + nna == 0 && return Base.mapreduce_impl(f, op, A.data, 1, n) + + mapreduce_impl_skipna(f, op, A) +end + +# This is only safe when we can guarantee that if a function is passed +# NA, it returns NA. Otherwise we will fall back to the implementation +# in Base, which is slow because it's type-unstable, but guarantees the +# correct semantics +function Base._mapreduce(f::Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun, + Base.ExpFun, Base.LogFun, Base.CentralizedAbs2Fun), + op::Union(Base.AddFun, Base.MulFun, Base.MaxFun, Base.MinFun), + A::DataArray) + any(A.na) && return NA + Base._mapreduce(f, op, A.data) +end + + +function Base.mapreduce(f, op::Function, A::DataArray; skipna::Bool=false) + is(op, +) ? (skipna ? _mapreduce_skipna(f, Base.AddFun(), A) : Base._mapreduce(f, Base.AddFun(), A)) : + is(op, *) ? (skipna ? _mapreduce_skipna(f, Base.MulFun(), A) : Base._mapreduce(f, Base.MulFun(), A)) : + is(op, &) ? (skipna ? _mapreduce_skipna(f, Base.AndFun(), A) : Base._mapreduce(f, Base.AndFun(), A)) : + is(op, |) ? (skipna ? _mapreduce_skipna(f, Base.OrFun(), A) : Base._mapreduce(f, Base.OrFun(), A)) : + skipna ? _mapreduce_skipna(f, op, A) : Base._mapreduce(f, op, A) +end + +Base.mapreduce(f, op, A::DataArray; skipna::Bool=false) = + skipna ? _mapreduce_skipna(f, op, A) : Base._mapreduce(f, op, A) + +Base.reduce(op, A::DataArray; skipna::Bool=false) = + mapreduce(Base.IdFun(), op, A; skipna=skipna) + +## usual reductions + +for (fn, op) in ((:(Base.sum), Base.AddFun()), + (:(Base.prod), Base.MulFun()), + (:(Base.minimum), Base.MinFun()), + (:(Base.maximum), Base.MaxFun())) + @eval begin + $fn(f::Union(Function,Base.Func{1}), a::DataArray; skipna::Bool=false) = + mapreduce(f, $op, a; skipna=skipna) + $fn(a::DataArray; skipna::Bool=false) = + mapreduce(Base.IdFun(), $op, a; skipna=skipna) + end +end + +for (fn, f, op) in ((:(Base.sumabs), Base.AbsFun(), Base.AddFun()), + (:(Base.sumabs2), Base.Abs2Fun(), Base.AddFun())) + @eval $fn(a::DataArray; skipna::Bool=false) = mapreduce($f, $op, a; skipna=skipna) +end + +## mean + +Base.mean(a::DataArray; skipna::Bool=false) = + sum(a; skipna=skipna) / (length(a.na)-(skipna ? countnz(a.na) : 0)) + +## variance + +function Base.varm{T}(A::DataArray{T}, m::Number; corrected::Bool=true, skipna::Bool=false) + if skipna + n = length(A) + na = A.na + + nna = countnz(na) + nna == n && return convert(Base.momenttype(T), NaN) + nna == n-1 && return convert(Base.momenttype(T), + abs2(A.data[Base.findnextnot(na, 1)] - m)/(1 - int(corrected))) + + /(nna == 0 ? Base.centralize_sumabs2(A.data, m, 1, n) : + mapreduce_impl_skipna(Base.CentralizedAbs2Fun(m), Base.AddFun(), A), + n - nna - int(corrected)) + else + any(A.na) && return NA + Base.varm(A.data, m; corrected=corrected) + end +end +Base.varm{T}(A::DataArray{T}, m::NAtype; corrected::Bool=true, skipna::Bool=false) = NA + +function Base.varzm{T}(A::DataArray{T}; corrected::Bool=true, skipna::Bool=false) + n = length(A) + nna = skipna ? countnz(A.na) : 0 + (n == 0 || n == nna) && return convert(Base.momenttype(T), NaN) + return Base.sumabs2(A; skipna=skipna) / (n - nna - int(corrected)) +end + +function Base.var(A::DataArray; corrected::Bool=true, mean=nothing, skipna::Bool=false) + mean == 0 ? Base.varzm(A; corrected=corrected, skipna=skipna) : + mean == nothing ? varm(A, Base.mean(A; skipna=skipna); corrected=corrected, skipna=skipna) : + isa(mean, Union(Number, NAtype)) ? varm(A, mean; corrected=corrected, skipna=skipna) : + error("Invalid value of mean.") +end + +Base.stdm(A::DataArray, m::Number; corrected::Bool=true, skipna::Bool=false) = + sqrt(varm(A, m; corrected=corrected, skipna=skipna)) + +Base.std(A::DataArray; corrected::Bool=true, mean=nothing, skipna::Bool=false) = + sqrt(var(A; corrected=corrected, mean=mean, skipna=skipna)) diff --git a/test/reduce.jl b/test/reduce.jl new file mode 100644 index 0000000..dac1b1d --- /dev/null +++ b/test/reduce.jl @@ -0,0 +1,133 @@ +module TestReduce +using DataArrays, Base.Test + +## extended test of sum + +for skipna in (true, false) + @test sum(@data(Int8[]); skipna=skipna) === 0 + @test sum(@data(Int[]); skipna=skipna) === 0 + @test sum(@data(Float64[]); skipna=skipna) === 0.0 + + @test sum(@data([int8(3)]); skipna=skipna) === 3 + @test sum(@data([3]); skipna=skipna) === 3 + @test sum(@data([3.0]); skipna=skipna) === 3.0 + + z = DataArray(reshape(1:16, (2,2,2,2))) + fz = convert(DataArray{Float64}, z) + bfz = convert(DataArray{BigFloat}, z) + @test sum(z) === 136 + @test sum(fz) === 136.0 + @test sum(bfz) == 136 +end + +@test sum(@data(Int[NA])) === NA +@test sum(@data(Int[NA]); skipna=true) === 0 +@test sum(@data(Int[NA, NA])) === NA +@test sum(@data(Int[NA, NA]); skipna=true) === 0 +@test sum(@data(Int[NA, NA, 1]); skipna=true) === 1 +@test sum(@data(Int[NA, NA, 1, 2]); skipna=true) === 3 +@test sum(@data(Int[NA, 1, NA, 1, 2]); skipna=true) === 4 + +z = DataArray(reshape(1:16, (2,2,2,2))) +z[6] = NA +fz = convert(DataArray{Float64}, z) +bfz = convert(DataArray{BigFloat}, z) +@test isna(sum(z)) +@test isna(sum(fz)) +@test isna(sum(bfz)) +@test sum(z; skipna=true) === 130 +@test sum(fz; skipna=true) === 130.0 +@test sum(bfz; skipna=true) == 130 + +bs = Base.sum_pairwise_blocksize(Base.IdFun()) +for n in [bs-64, bs-1, bs, bs+1, bs+2, 2*bs-2:2*bs+3, 4*bs-2:4*bs+3] + da = DataArray(randn(n)) + s = sum(da.data) + @test_approx_eq sum(da) s + @test_approx_eq sum(da; skipna=true) s + + da2 = copy(da) + da2[1:2:end] = NA + @test isna(sum(da2)) + @test_approx_eq sum(da2; skipna=true) sum(dropna(da2)) + + da2 = convert(DataArray{BigFloat}, da2) + @test isna(sum(da2)) + @test_approx_eq sum(da2; skipna=true) sum(dropna(da2)) + + da2 = copy(da) + da2[2:2:end] = NA + @test isna(sum(da2)) + @test_approx_eq sum(da2; skipna=true) sum(dropna(da2)) + + da2 = convert(DataArray{BigFloat}, da2) + @test isna(sum(da2)) + @test_approx_eq sum(da2; skipna=true) sum(dropna(da2)) +end + +## other reductions + +macro same_behavior(ex1, ex2) + quote + v = try + $ex2 + catch e + e + end + isa(v, Exception) ? @test_throws(typeof(v), $ex1) : @test_approx_eq($ex1, v) + end +end + +_varuc(x; kw...) = var(x; corrected=false, kw...) +_varzm(x; kw...) = var(x; mean=0, kw...) +_varzmuc(x; kw...) = var(x; corrected=false, mean=0, kw...) +_var1m(x; kw...) = var(x; mean=1, kw...) +_var1muc(x; kw...) = var(x; corrected=false, mean=1, kw...) +_std1m(x; kw...) = stdm(x, 1; kw...) + +for fn in (prod, minimum, maximum, mean, + var, _varuc, _varzm, _varzmuc, _var1m, _var1muc, + std, _std1m, Base.sumabs, Base.sumabs2) + for n in [0, 1, 2, 62, 63, 64, 65, 66] + da = DataArray(randn(n)) + @same_behavior fn(da) fn(da.data) + @same_behavior fn(da; skipna=true) fn(da.data) + + da2 = copy(da) + da2[1:2:end] = NA + n > 0 && @test isna(fn(da2)) + @same_behavior fn(da2; skipna=true) fn(dropna(da2)) + + da2 = convert(DataArray{BigFloat}, da2) + n > 0 && @test isna(fn(da2)) + @same_behavior fn(da2; skipna=true) fn(dropna(da2)) + + da2 = copy(da) + da2[2:2:end] = NA + n > 1 && @test isna(fn(da2)) + @same_behavior fn(da2; skipna=true) fn(dropna(da2)) + + da2 = convert(DataArray{BigFloat}, da2) + n > 1 && @test isna(fn(da2)) + @same_behavior fn(da2; skipna=true) fn(dropna(da2)) + end +end + +## reduce and mapreduce drivers + +for fn in (+, *, |, &) + da = convert(DataArray, randbool(10)) + + s = mapreduce(Base.IdFun(), fn, da.data) + @test mapreduce(Base.IdFun(), fn, da) == s + @test mapreduce(Base.IdFun(), fn, da; skipna=true) == s + @test reduce(fn, da) == s + @test reduce(fn, da; skipna=true) == s +end + +# make sure reductions of & and | are still calling Base +@test isna(reduce(&, @data([true, NA]))) +@test !reduce(&, @data([false, NA])) +@test reduce(|, @data([true, NA])) +@test isna(reduce(|, @data([false, NA]))) +end diff --git a/test/runtests.jl b/test/runtests.jl index f103210..8fcab19 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,7 @@ my_tests = ["abstractarray.jl", "linalg.jl", #"test/nas.jl", "operators.jl", + "reduce.jl", "broadcast.jl", "padding.jl", "pooleddataarray.jl", From a0ad165679e3b0e401176ccae5b2411dc72cb32c Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 14 Jul 2014 01:34:54 -0400 Subject: [PATCH 2/7] Initial implementation of reductions across dimensions --- src/DataArrays.jl | 1 + src/broadcast.jl | 2 +- src/operators.jl | 14 ++- src/reduce.jl | 20 ++-- src/reducedim.jl | 252 ++++++++++++++++++++++++++++++++++++++++++++++ test/reducedim.jl | 131 ++++++++++++++++++++++++ test/runtests.jl | 1 + 7 files changed, 408 insertions(+), 13 deletions(-) create mode 100644 src/reducedim.jl create mode 100644 test/reducedim.jl diff --git a/src/DataArrays.jl b/src/DataArrays.jl index f25ccb9..5d678cf 100644 --- a/src/DataArrays.jl +++ b/src/DataArrays.jl @@ -64,6 +64,7 @@ module DataArrays include("linalg.jl") include("operators.jl") include("reduce.jl") + include("reducedim.jl") include("broadcast.jl") include("sort.jl") include("extras.jl") diff --git a/src/broadcast.jl b/src/broadcast.jl index 6d229c9..35b4669 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -1,4 +1,4 @@ -using DataArrays, Base.Cartesian, Base.@get! +using DataArrays, Base.@get! using Base.Broadcast: bitcache_chunks, bitcache_size, dumpbitcache, promote_eltype, broadcast_shape, eltype_plus, type_minus, type_div, type_pow diff --git a/src/operators.jl b/src/operators.jl index a67215d..980341b 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -9,6 +9,7 @@ const numeric_unary_operators = [:(Base.(:+)), const logical_unary_operators = [:(Base.(:!))] const elementary_functions = [:(Base.abs), + :(Base.abs2), :(Base.sign), :(Base.acos), :(Base.acosh), @@ -453,9 +454,17 @@ end # XXX: The below should be revisited once we have a way to infer what # the proper return type of an array should be. +# One-argument elementary functions that do something different for +# Complex +for f in (:(Base.abs), :(Base.abs2)) + @eval begin + @dataarray_unary $(f) Complex T.parameters[1] + end +end + # One-argument elementary functions that return the same type as their # inputs -for f in (:(Base.abs), :(Base.conj), :(Base.sign)) +for f in (:(Base.abs), :(Base.abs2), :(Base.conj), :(Base.sign)) @eval begin $(f)(::NAtype) = NA @dataarray_unary $(f) Number T @@ -665,7 +674,8 @@ Base.(:.^)(::MathConst{:e}, B::AbstractDataArray) = exp(B) for f in (:(Base.(:+)), :(Base.(:.+)), :(Base.(:-)), :(Base.(:.-)), :(Base.(:*)), :(Base.(:.*)), :(Base.(:.^)), :(Base.div), - :(Base.mod), :(Base.fld), :(Base.rem)) + :(Base.mod), :(Base.fld), :(Base.rem), :(Base.min), + :(Base.max)) @eval begin # Scalar with NA ($f)(::NAtype, ::NAtype) = NA diff --git a/src/reduce.jl b/src/reduce.jl index 4a21359..549b342 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -34,11 +34,12 @@ end # Kernel that skips NA without branching # Only used for cases where branching is more expensive than computing for NAs -function mapreduce_seq_impl_skipna(f::Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun), - op::Union(Base.AddFun, Base.MulFun, Base.AndFun, Base.OrFun), - T::Union(Type{Uint8}, Type{Uint16}, Type{Uint32}, Type{Uint64}, - Type{Int8}, Type{Int16}, Type{Int32}, Type{Int64}, - Type{Float32}, Type{Float64}), +typealias FastMapFuns Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun) +typealias FastReduceFuns Union(Base.AddFun, Base.MulFun, Base.AndFun, Base.OrFun) +typealias FastBitsTypes Union(Type{Uint8}, Type{Uint16}, Type{Uint32}, Type{Uint64}, + Type{Int8}, Type{Int16}, Type{Int32}, Type{Int64}, + Type{Float32}, Type{Float64}) +function mapreduce_seq_impl_skipna(f::FastMapFuns, op::FastReduceFuns, T::FastBitsTypes, A::DataArray, ifirst::Int, ilast::Int) data = A.data na = A.na @@ -106,15 +107,14 @@ end # NA, it returns NA. Otherwise we will fall back to the implementation # in Base, which is slow because it's type-unstable, but guarantees the # correct semantics -function Base._mapreduce(f::Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun, - Base.ExpFun, Base.LogFun, Base.CentralizedAbs2Fun), - op::Union(Base.AddFun, Base.MulFun, Base.MaxFun, Base.MinFun), - A::DataArray) +typealias SafeMapFuns Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun, + Base.ExpFun, Base.LogFun, Base.CentralizedAbs2Fun) +typealias SafeReduceFuns Union(Base.AddFun, Base.MulFun, Base.MaxFun, Base.MinFun) +function Base._mapreduce(f::SafeMapFuns, op::SafeReduceFuns, A::DataArray) any(A.na) && return NA Base._mapreduce(f, op, A.data) end - function Base.mapreduce(f, op::Function, A::DataArray; skipna::Bool=false) is(op, +) ? (skipna ? _mapreduce_skipna(f, Base.AddFun(), A) : Base._mapreduce(f, Base.AddFun(), A)) : is(op, *) ? (skipna ? _mapreduce_skipna(f, Base.MulFun(), A) : Base._mapreduce(f, Base.MulFun(), A)) : diff --git a/src/reducedim.jl b/src/reducedim.jl new file mode 100644 index 0000000..e5b54da --- /dev/null +++ b/src/reducedim.jl @@ -0,0 +1,252 @@ +@ngenerate N typeof(R) function Base._mapreducedim!{N}(f, op::Base.AndFun, R::Array{Bool}, A::BitArray{N}) + lsiz = Base.check_reducdims(R, A) + isempty(A) && return R + @nextract N sizeR d->size(R, d) + @nexprs 1 d->(state_0 = state_{N} = 1) + @nexprs N d->(skip_d = sizeR_d == 1) + Achunks = A.chunks + k = 1 + @nloops(N, i, A, + d->(state_{d-1} = state_d), + d->(skip_d || (state_d = state_0)), + begin + @inbounds R[state_0] &= evaluate(f, Base.unsafe_bitgetindex(Achunks, k)) + state_0 += 1 + k += 1 + end) + R +end + +# Determine if there are any true values in a BitArray in a given range +function _any(B::BitArray, istart::Int, iend::Int) + chunks = B.chunks + startidx, startbit = Base.get_chunks_id(istart) + endidx, endbit = Base.get_chunks_id(iend) + startidx == endidx && return chunks[startidx] >> startbit << (63-endbit+startbit) != 0 + + chunks[startidx] >> startbit != 0 && return true + for i = startidx+1:endidx-1 + chunks[i] != 0 && return true + end + chunks[endidx] << (63-endbit) != 0 && return true + return false +end + +## NA-preserving +@ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns, + R::AbstractArray, A::DataArray{T,N}) + data = A.data + na = A.na + + lsiz = Base.check_reducdims(R, data) + isempty(data) && return R + @nextract N sizeR d->size(R,d) + + if lsiz > 16 + # use mapreduce_impl, which is probably better tuned to achieve higher performance + nslices = div(length(A), lsiz) + ibase = 0 + extr = daextract(R) + for i = 1:nslices + if _any(na, ibase+1, ibase+lsiz) + if isa(R, DataArray) + unsafe_setna!(R, extr, i) + continue + else + error("cannot reduce a DataArray containing NAs to an AbstractArray") + end + end + @inbounds unsafe_dasetindex!(R, extr, Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz), i) + ibase += lsiz + end + elseif isa(R, DataArray) + na_chunks = A.na.chunks + + # If reducing to a DataArray, skip strides with NAs + new_data = R.data + + # In my benchmarks, it is actually faster to compute a new NA + # array and bitpack it than to operate on the BitArray + # directly. + new_na = fill(false, size(new_data)) + + @nexprs 1 d->(state_0 = state_{N} = 1) + @nexprs N d->(skip_d = sizeR_d == 1) + k = 1 + @nloops(N, i, A, + d->(state_{d-1} = state_d), + d->(skip_d || (state_d = state_0)), begin + @inbounds if new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k) + new_na[state_0] = true + @goto loopend + end + + @inbounds x = data[k] + v = evaluate(f, x) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv + + @label loopend + state_0 += 1 + k += 1 + end) + + R.na = bitpack(new_na) + else + # if reducing to a non-DataArray, throw an error at the start on NA + any(isna(A)) && error("cannot reduce a DataArray containing NAs to an AbstractArray") + @nloops N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin + @inbounds x = (@nref N data i) + v = evaluate(f, x) + @inbounds v0 = (@nref N R j) + nv = evaluate(op, v0, v) + @inbounds (@nref N R j) = nv + end + end + return R +end +_mapreducedim!(f, op, R, A) = Base._mapreducedim!(f, op, R, A) + +## NA-skipping +_getdata(A) = A +_getdata(A::DataArray) = A.data +@ngenerate N typeof(R) function _mapreducedim_skipna_impl!{T,N}(f, op, R::AbstractArray, A::DataArray{T,N}) + data = A.data + na = A.na + na_chunks = na.chunks + new_data = _getdata(R) + + lsiz = Base.check_reducdims(new_data, data) + isempty(data) && return new_data + @nextract N sizeR d->size(new_data,d) + sizA1 = size(data, 1) + + if lsiz > 16 + # keep the accumulator as a local variable when reducing along the first dimension + nslices = div(length(A), lsiz) + ibase = 0 + for i = 1:nslices + # TODO: use pairwise impl for sum + @inbounds v = new_data[i] + for k = ibase+1:ibase+lsiz + @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue + @inbounds x = data[k] + v = convert(typeof(v), evaluate(op, evaluate(f, x), v))::typeof(v) + end + @inbounds new_data[i] = v + ibase += lsiz + end + else + # general implementation + @nexprs 1 d->(state_0 = state_{N} = 1) + @nexprs N d->(skip_d = sizeR_d == 1) + k = 1 + @nloops(N, i, A, + d->(state_{d-1} = state_d), + d->(skip_d || (state_d = state_0)), begin + @inbounds Base.unsafe_bitgetindex(na_chunks, k) && @goto loopend + + @inbounds x = data[k] + v = evaluate(f, x) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv + + @label loopend + state_0 += 1 + k += 1 + end) + end + return R +end + +_mapreducedim_skipna!(f, op, R::AbstractArray, A::DataArray) = + _mapreducedim_skipna_impl!(f, op, R, A) + +# for MinFun/MaxFun, min or max is NA if all values along a dimension are NA +function _mapreducedim_skipna!(f, op::Union(Base.MinFun, Base.MaxFun), R::AbstractArray, A::DataArray) + R.na = bitpack(all!(fill(true, size(R.na)), A.na)) + _mapreducedim_skipna_impl!(f, op, R, A) +end + +## general reducedim interface + +for op in (Base.AddFun, Base.MulFun, Base.AndFun, Base.OrFun, Base.MinFun, Base.MaxFun) + @eval begin + function Base.initarray!{T}(a::DataArray{T}, op::$op, init::Bool) + if init + Base.initarray!(a.data, op, true) + Base.fill!(a.na, false) + end + a + end + end +end + +function Base.reducedim_initarray{R}(A::DataArray, region, v0, ::Type{R}) + rd = Base.reduced_dims(A.data, region) + DataArray(fill!(similar(A.data, R, rd), v0), falses(rd)) +end +function Base.reducedim_initarray0{R}(A::DataArray, region, v0, ::Type{R}) + rd = Base.reduced_dims0(A,region) + DataArray(fill!(similar(A.data, R, rd), v0), falses(rd)) +end + +function Base.mapreducedim!(f::Function, op, R::AbstractArray, A::DataArray; skipna::Bool=false) + is(op, +) ? (skipna ? _mapreducedim_skipna!(f, Base.AddFun(), R, A) : _mapreducedim!(f, Base.AddFun(), R, A)) : + is(op, *) ? (skipna ? _mapreducedim_skipna!(f, Base.MulFun(), R, A) : _mapreducedim!(f, Base.MulFun(), R, A)) : + is(op, &) ? (skipna ? _mapreducedim_skipna!(f, Base.AndFun(), R, A) : _mapreducedim!(f, Base.AndFun(), R, A)) : + is(op, |) ? (skipna ? _mapreducedim_skipna!(f, Base.OrFun(), R, A) : _mapreducedim!(f, Base.OrFun(), R, A)) : + skipna ? _mapreducedim_skipna!(f, op, R, A) : _mapreducedim!(f, op, R, A) +end +Base.mapreducedim!(f, op, R::AbstractArray, A::DataArray; skipna::Bool=false) = + skipna ? _mapreducedim_skipna!(f, op, R, A) : _mapreducedim!(f, op, R, A) +Base.reducedim!{RT}(op, R::DataArray{RT}, A::AbstractArray; skipna::Bool=false) = + Base.mapreducedim!(Base.IdFun(), op, R, A, zero(RT); skipna=skipna) + +Base.mapreducedim(f, op, A::DataArray, region, v0; skipna::Bool=false) = + Base.mapreducedim!(f, op, Base.reducedim_initarray(A, region, v0), A; skipna=skipna) +Base.mapreducedim{T}(f, op, A::DataArray{T}, region; skipna::Bool=false) = + Base.mapreducedim!(f, op, Base.reducedim_init(f, op, A, region), A; skipna=skipna) + +Base.reducedim(op, A::DataArray, region, v0; skipna::Bool=false) = + Base.mapreducedim(Base.IdFun(), op, A, region, v0; skipna=skipna) +Base.reducedim(op, A::DataArray, region; skipna::Bool=false) = + Base.mapreducedim(Base.IdFun(), op, A, region; skipna=skipna) + +## usual reductions + +for (basfn, Op) in [(:sum, Base.AddFun), (:prod, Base.MulFun), + (:maximum, Base.MaxFun), (:minimum, Base.MinFun), + (:all, Base.AndFun), (:any, Base.OrFun)] + fname = Expr(:., :Base, Base.Meta.quot(basfn)) + fname! = Expr(:., :Base, Base.Meta.quot(symbol(string(basfn, '!')))) + @eval begin + $(fname!)(f::Union(Function,Base.Func{1}), r::AbstractArray, A::DataArray; + init::Bool=true, skipna::Bool=false) = + Base.mapreducedim!(f, $(Op)(), Base.initarray!(r, $(Op)(), init), A; skipna=skipna) + $(fname!)(r::AbstractArray, A::DataArray; init::Bool=true, skipna::Bool=false) = + $(fname!)(Base.IdFun(), r, A; init=init, skipna=skipna) + + $(fname)(f::Union(Function,Base.Func{1}), A::DataArray, region; skipna::Bool=false) = + Base.mapreducedim(f, $(Op)(), A, region; skipna=skipna) + $(fname)(A::DataArray, region; skipna::Bool=false) = + $(fname)(Base.IdFun(), A, region; skipna=skipna) + end +end + +for (basfn, fbase, Fun) in [(:sumabs, :sum, Base.AbsFun), + (:sumabs2, :sum, Base.Abs2Fun), + (:maxabs, :maximum, Base.AbsFun), + (:minabs, :minimum, Base.AbsFun)] + fname = Expr(:., :Base, Base.Meta.quot(basfn)) + fname! = Expr(:., :Base, Base.Meta.quot(symbol(string(basfn, '!')))) + fbase! = Expr(:., :Base, Base.Meta.quot(symbol(string(fbase, '!')))) + @eval begin + $(fname!)(r::AbstractArray, A::DataArray; init::Bool=true, skipna::Bool=false) = + $(fbase!)($(Fun)(), r, A; init=init, skipna=skipna) + $(fname)(A::DataArray, region; skipna::Bool=false) = + $(fbase)($(Fun)(), A, region; skipna=skipna) + end +end diff --git a/test/reducedim.jl b/test/reducedim.jl new file mode 100644 index 0000000..b5ad91f --- /dev/null +++ b/test/reducedim.jl @@ -0,0 +1,131 @@ +module TestReducedim +using DataArrays, Base.Test + +# Test for fast unit stride BitArray functions +function test_any() + bits = randbool(64*5) + for i = 1:length(bits), j = i:length(bits) + v = false + for k = i:j + v |= bits[k] + end + Base.Test.@test DataArrays._any(bits, i, j) == v + end +end +test_any() + +# mapslices from Base, hacked to work for these cases +function safe_mapslices{T}(f::Function, A::AbstractArray{T}, region, skipna) + dims = intersect(region, 1:ndims(A)) + if isempty(dims) + if skipna + naval = f(T[], 1) + A = copy(A) + A[isna(A)] = isempty(naval) ? NA : naval[1] + end + return A + end + + if isempty(dims) + return map(f,A) + end + + dimsA = [size(A)...] + ndimsA = ndims(A) + alldims = [1:ndimsA] + + otherdims = setdiff(alldims, dims) + + idx = cell(ndimsA) + fill!(idx, 1) + Asliceshape = tuple(dimsA[dims]...) + itershape = tuple(dimsA[otherdims]...) + for d in dims + idx[d] = 1:size(A,d) + end + + r1 = f(reshape(A[idx...], Asliceshape); skipna=skipna) + + # determine result size and allocate + Rsize = copy(dimsA) + # TODO: maybe support removing dimensions + if !isa(r1, AbstractArray) || ndims(r1) == 0 + r2 = similar(A, T, 1) + r2[1] = r1 + r1 = r2 + end + Rsize[dims] = [size(r1)...; ones(Int,max(0,length(dims)-ndims(r1)))] + R = similar(r1, tuple(Rsize...)) + + ridx = cell(ndims(R)) + fill!(ridx, 1) + for d in dims + ridx[d] = 1:size(R,d) + end + + R[ridx...] = r1 + + first = true + Base.cartesianmap(itershape) do idxs... + if first + first = false + else + ia = [idxs...] + idx[otherdims] = ia + ridx[otherdims] = ia + try + R[ridx...] = f(reshape(A[idx...], Asliceshape); skipna=skipna) + catch e + if isa(e, ErrorException) && e.msg == "Reducing over an empty array is not allowed." + R[ridx...] = NA + else + rethrow(e) + end + end + end + end + + return R +end + +macro test_da_approx_eq(da1, da2) + quote + v1 = $da1 + v2 = $da2 + na = isna(v1) + if na != isna(v2) + println(v1) + println(v2) + end + @test na == isna(v2) + defined = !na + if any(defined) + @test_approx_eq v1[defined] v2[defined] + end + end +end + +for Areduc in (DataArray(rand(3, 4, 5, 6)), + DataArray(rand(3, 4, 5, 6), rand(3, 4, 5, 6) .< 0.2)) + for skipna = (false, true) + for region in { + 1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), + (1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)} + r = DataArray(fill(NaN, Base.reduced_dims(size(Areduc), region))) + @test_da_approx_eq sum!(r, Areduc; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) + @test_da_approx_eq prod!(r, Areduc; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) + @test_da_approx_eq maximum!(r, Areduc; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) + @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) + @test_da_approx_eq Base.sumabs!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) + @test_da_approx_eq Base.sumabs2!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) + + @test_da_approx_eq sum(Areduc, region; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) + @test_da_approx_eq prod(Areduc, region; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) + @test_da_approx_eq maximum(Areduc, region; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) + @test_da_approx_eq minimum(Areduc, region; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) + @test_da_approx_eq Base.sumabs(Areduc, region; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) + @test_da_approx_eq Base.sumabs2(Areduc, region; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) + end + end +end +end diff --git a/test/runtests.jl b/test/runtests.jl index 8fcab19..16bb124 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ my_tests = ["abstractarray.jl", #"test/nas.jl", "operators.jl", "reduce.jl", + "reducedim.jl", "broadcast.jl", "padding.jl", "pooleddataarray.jl", From 55ec26efef9b5af4d22b7e3a10370e17b22adb89 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 16 Jul 2014 01:35:48 -0400 Subject: [PATCH 3/7] Add mean and var across dimensions --- src/broadcast.jl | 10 +- src/dataarray.jl | 3 + src/indexing.jl | 1 + src/reducedim.jl | 366 ++++++++++++++++++++++++++++++++++++++++++---- test/reducedim.jl | 23 +++ 5 files changed, 371 insertions(+), 32 deletions(-) diff --git a/src/broadcast.jl b/src/broadcast.jl index 35b4669..af20330 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -103,9 +103,11 @@ function gen_broadcast_dataarray(nd::Int, arrtype::(DataType...), outtype, f::Fu # Set up output DataArray/PooledDataArray $(if outtype == DataArray quote - Bc = B.na.chunks - fill!(Bc, 0) Bdata = B.data + # Copy in case aliased + # TODO: check for aliasing? + Bna = falses(size(Bdata)) + Bc = Bna.chunks ind = 1 end elseif outtype == PooledDataArray @@ -158,6 +160,10 @@ function gen_broadcast_dataarray(nd::Int, arrtype::(DataType...), outtype, f::Fu :(ind += 1) end) end) + + $(if outtype == DataArray + :(B.na = Bna) + end) end _F_ end diff --git a/src/dataarray.jl b/src/dataarray.jl index d625541..8fe16ea 100644 --- a/src/dataarray.jl +++ b/src/dataarray.jl @@ -152,6 +152,9 @@ function Base.copy!(dest::DataArray, src::DataArray) # -> DataArray{T} dest end +Base.fill!(A::DataArray, ::NAtype) = (fill!(A.na, true); A) +Base.fill!(A::DataArray, v) = (fill!(A.data, v); fill!(A.na, false); A) + #' @description #' #' Create a deep copy of a DataArray. diff --git a/src/indexing.jl b/src/indexing.jl index d0358bd..c15374d 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -12,6 +12,7 @@ daextract(a) = nothing # Check for NA unsafe_isna(da::DataArray, extr, idx::Real) = Base.unsafe_bitgetindex(extr[2], idx) unsafe_isna(pda::PooledDataArray, extr, idx::Real) = extr[1][idx] == 0 +unsafe_isna(a, extr, idx::Real) = false unsafe_getindex_notna(da::DataArray, extr, idx::Real) = getindex(extr[1], idx) unsafe_getindex_notna(pda::PooledDataArray, extr, idx::Real) = getindex(extr[2], extr[1][idx]) unsafe_getindex_notna(a, extr, idx::Real) = Base.unsafe_getindex(a, idx) diff --git a/src/reducedim.jl b/src/reducedim.jl index e5b54da..40977fa 100644 --- a/src/reducedim.jl +++ b/src/reducedim.jl @@ -1,3 +1,8 @@ +## Utility function + +# This is a substantially faster implementation of the "all" reduction +# across dimensions for reducing a BitArray to an Array{Bool}. We use +# this below for implementing MaxFun and MinFun with skipna=true. @ngenerate N typeof(R) function Base._mapreducedim!{N}(f, op::Base.AndFun, R::Array{Bool}, A::BitArray{N}) lsiz = Base.check_reducdims(R, A) isempty(A) && return R @@ -17,7 +22,9 @@ R end -# Determine if there are any true values in a BitArray in a given range +# Determine if there are any true values in a BitArray in a given +# range. We use this for reductions with skipna=false along the first +# dimension. function _any(B::BitArray, istart::Int, iend::Int) chunks = B.chunks startidx, startbit = Base.get_chunks_id(istart) @@ -32,6 +39,24 @@ function _any(B::BitArray, istart::Int, iend::Int) return false end +# Counts the number of ones in a given range. We use this for counting +# the values for mean and var with skipna=false along the first +# dimension. +function _count(B::BitArray, istart::Int, iend::Int) + chunks = B.chunks + startidx, startbit = Base.get_chunks_id(istart) + endidx, endbit = Base.get_chunks_id(iend) + startidx == endidx && return count_ones(chunks[startidx] >> startbit << (63-endbit+startbit)) + + n = 0 + n += count_ones(chunks[startidx] >> startbit) + for i = startidx+1:endidx-1 + n += count_ones(chunks[i]) + end + n += count_ones(chunks[endidx] << (63-endbit)) + return n +end + ## NA-preserving @ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns, R::AbstractArray, A::DataArray{T,N}) @@ -40,7 +65,6 @@ end lsiz = Base.check_reducdims(R, data) isempty(data) && return R - @nextract N sizeR d->size(R,d) if lsiz > 16 # use mapreduce_impl, which is probably better tuned to achieve higher performance @@ -51,20 +75,22 @@ end if _any(na, ibase+1, ibase+lsiz) if isa(R, DataArray) unsafe_setna!(R, extr, i) - continue else error("cannot reduce a DataArray containing NAs to an AbstractArray") end + else + v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz) + @inbounds unsafe_dasetindex!(R, extr, v, i) end - @inbounds unsafe_dasetindex!(R, extr, Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz), i) ibase += lsiz end elseif isa(R, DataArray) + @nextract N sizeR d->size(R,d) na_chunks = A.na.chunks - # If reducing to a DataArray, skip strides with NAs new_data = R.data + # If reducing to a DataArray, skip strides with NAs. # In my benchmarks, it is actually faster to compute a new NA # array and bitpack it than to operate on the BitArray # directly. @@ -76,25 +102,26 @@ end @nloops(N, i, A, d->(state_{d-1} = state_d), d->(skip_d || (state_d = state_0)), begin - @inbounds if new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k) - new_na[state_0] = true - @goto loopend + @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k) + if vna + @inbounds new_na[state_0] = true + else + @inbounds x = data[k] + v = evaluate(f, x) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv end - @inbounds x = data[k] - v = evaluate(f, x) - @inbounds v0 = new_data[state_0] - nv = evaluate(op, v0, v) - @inbounds new_data[state_0] = nv - - @label loopend state_0 += 1 k += 1 end) R.na = bitpack(new_na) else - # if reducing to a non-DataArray, throw an error at the start on NA + @nextract N sizeR d->size(R,d) + + # If reducing to a non-DataArray, throw an error at the start on NA any(isna(A)) && error("cannot reduce a DataArray containing NAs to an AbstractArray") @nloops N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin @inbounds x = (@nref N data i) @@ -111,16 +138,20 @@ _mapreducedim!(f, op, R, A) = Base._mapreducedim!(f, op, R, A) ## NA-skipping _getdata(A) = A _getdata(A::DataArray) = A.data -@ngenerate N typeof(R) function _mapreducedim_skipna_impl!{T,N}(f, op, R::AbstractArray, A::DataArray{T,N}) + +# mapreduce across a dimension. If specified, C contains the number of +# non-NA values reduced into each element of R. +@ngenerate N typeof(R) function _mapreducedim_skipna_impl!{T,N}(f, op, R::AbstractArray, + C::Union(Array{Int}, Nothing), + A::DataArray{T,N}) data = A.data na = A.na na_chunks = na.chunks new_data = _getdata(R) + isa(C, Nothing) || size(R) == size(C) || error("R and C must have same size") lsiz = Base.check_reducdims(new_data, data) - isempty(data) && return new_data - @nextract N sizeR d->size(new_data,d) - sizA1 = size(data, 1) + isempty(data) && return R if lsiz > 16 # keep the accumulator as a local variable when reducing along the first dimension @@ -129,6 +160,7 @@ _getdata(A::DataArray) = A.data for i = 1:nslices # TODO: use pairwise impl for sum @inbounds v = new_data[i] + @inbounds !isa(C, Nothing) && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz)) for k = ibase+1:ibase+lsiz @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue @inbounds x = data[k] @@ -139,21 +171,25 @@ _getdata(A::DataArray) = A.data end else # general implementation + @nextract N sizeR d->size(new_data,d) @nexprs 1 d->(state_0 = state_{N} = 1) @nexprs N d->(skip_d = sizeR_d == 1) k = 1 + !isa(C, Nothing) && fill!(C, div(length(A), length(R))) @nloops(N, i, A, d->(state_{d-1} = state_d), d->(skip_d || (state_d = state_0)), begin - @inbounds Base.unsafe_bitgetindex(na_chunks, k) && @goto loopend - - @inbounds x = data[k] - v = evaluate(f, x) - @inbounds v0 = new_data[state_0] - nv = evaluate(op, v0, v) - @inbounds new_data[state_0] = nv + @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k) + if xna + !isa(C, Nothing) && @inbounds C[state_0] -= 1 + else + @inbounds x = data[k] + v = evaluate(f, x) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv + end - @label loopend state_0 += 1 k += 1 end) @@ -162,12 +198,12 @@ _getdata(A::DataArray) = A.data end _mapreducedim_skipna!(f, op, R::AbstractArray, A::DataArray) = - _mapreducedim_skipna_impl!(f, op, R, A) + _mapreducedim_skipna_impl!(f, op, R, nothing, A) # for MinFun/MaxFun, min or max is NA if all values along a dimension are NA function _mapreducedim_skipna!(f, op::Union(Base.MinFun, Base.MaxFun), R::AbstractArray, A::DataArray) R.na = bitpack(all!(fill(true, size(R.na)), A.na)) - _mapreducedim_skipna_impl!(f, op, R, A) + _mapreducedim_skipna_impl!(f, op, R, nothing, A) end ## general reducedim interface @@ -250,3 +286,273 @@ for (basfn, fbase, Fun) in [(:sumabs, :sum, Base.AbsFun), $(fbase)($(Fun)(), A, region; skipna=skipna) end end + +## mean + +function Base.mean!{T}(R::AbstractArray{T}, A::DataArray; skipna::Bool=false, + init::Bool=true) + init && fill!(R, zero(eltype(R))) + if skipna + C = Array(Int, size(R)) + _mapreducedim_skipna_impl!(Base.IdFun(), Base.AddFun(), R, C, A) + broadcast!(/, R, R, C) + else + sum!(R, A; skipna=false) + broadcast!(/, R, R, convert(T, length(A)/length(R))) + R + end +end + +Base.mean{T}(A::DataArray{T}, region; skipna::Bool=false) = + mean!(Base.reducedim_initarray(A, region, zero(Base.momenttype(T))), A; skipna=skipna, + init=false) + +## var + +immutable MapReduceDim2ArgHelperFun{F,T} + f::F + val::T +end +Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) + +# A version of _mapreducedim! that accepts an array S of the same size +# as R, the elements of which are passed as a second argument to f. +@ngenerate N typeof(R) function _mapreducedim_2arg!{T,N}(f, op, R::AbstractArray, + A::DataArray{T,N}, + S::AbstractArray) + data = A.data + na = A.na + Sextr = daextract(S) + + lsiz = Base.check_reducdims(R, data) + size(R) == size(S) || error("R and S must have same size") + isempty(data) && return R + + if lsiz > 16 + # use mapreduce_impl, which is probably better tuned to achieve higher performance + nslices = div(length(A), lsiz) + ibase = 0 + extr = daextract(R) + for i = 1:nslices + if unsafe_isna(S, Sextr, i) || _any(na, ibase+1, ibase+lsiz) + if isa(R, DataArray) + unsafe_setna!(R, extr, i) + else + error("cannot reduce a DataArray containing NAs to an AbstractArray") + end + else + @inbounds s = unsafe_getindex_notna(S, Sextr, i) + v = Base.mapreduce_impl(MapReduceDim2ArgHelperFun(f, s), op, data, ibase+1, ibase+lsiz) + @inbounds unsafe_dasetindex!(R, extr, v, i) + end + ibase += lsiz + end + elseif isa(R, DataArray) + @nextract N sizeR d->size(R,d) + na_chunks = A.na.chunks + new_data = R.data + new_na = bitunpack(S.na) + + @nexprs 1 d->(state_0 = state_{N} = 1) + @nexprs N d->(skip_d = sizeR_d == 1) + k = 1 + @nloops(N, i, A, + d->(state_{d-1} = state_d), + d->(skip_d || (state_d = state_0)), begin + @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k) + if vna + @inbounds new_na[state_0] = true + else + @inbounds s = unsafe_getindex_notna(S, Sextr, state_0) + @inbounds x = data[k] + v = evaluate(f, x, s) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv + end + + state_0 += 1 + k += 1 + end) + + R.na = bitpack(new_na) + else + @nextract N sizeR d->size(R,d) + + # If reducing to a non-DataArray, throw an error at the start on NA + (any(isna(A)) || any(isna(S))) && error("cannot reduce a DataArray containing NAs to an AbstractArray") + @nloops N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin + @inbounds s = unsafe_getindex_notna(S, Sextr, state_0) + @inbounds x = (@nref N data i) + v = evaluate(f, x, s) + @inbounds v0 = (@nref N R j) + nv = evaluate(op, v0, v) + @inbounds (@nref N R j) = nv + end + end + return R +end + +# A version of _mapreducedim_skipna! that accepts an array S of the same size +# as R, the elements of which are passed as a second argument to f. +@ngenerate N typeof(R) function _mapreducedim_skipna_2arg!{T,N}(f, op, R::AbstractArray, + C::Union(Array{Int}, Nothing), + A::DataArray{T,N}, S::AbstractArray) + data = A.data + na = A.na + na_chunks = na.chunks + new_data = _getdata(R) + Sextr = daextract(S) + + lsiz = Base.check_reducdims(new_data, data) + isa(C, Nothing) || size(R) == size(C) || error("R and C must have same size") + size(R) == size(S) || error("R and S must have same size") + isempty(data) && return R + @nextract N sizeR d->size(new_data,d) + sizA1 = size(data, 1) + + # If there are any NAs in S, assume these will produce NAs in R + if isa(S, DataArray) + if isa(R, DataArray) + copy!(R.na, S.na) + elseif any(S.na) + error("cannot reduce a DataArray containing NAs to an AbstractArray") + end + end + + if lsiz > 16 + # keep the accumulator as a local variable when reducing along the first dimension + nslices = div(length(A), lsiz) + ibase = 0 + for i = 1:nslices + @inbounds v = new_data[i] + !isa(C, Nothing) && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz)) + + # If S[i] is NA, skip this iteration + @inbounds sna = unsafe_isna(S, Sextr, i) + if !sna + @inbounds s = unsafe_getindex_notna(S, Sextr, i) + # TODO: use pairwise impl for sum + for k = ibase+1:ibase+lsiz + @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue + @inbounds x = data[k] + v = convert(typeof(v), evaluate(op, evaluate(f, x, s), v))::typeof(v) + end + + @inbounds new_data[i] = v + end + + ibase += lsiz + end + else + # general implementation + @nexprs 1 d->(state_0 = state_{N} = 1) + @nexprs N d->(skip_d = sizeR_d == 1) + k = 1 + !isa(C, Nothing) && fill!(C, div(length(A), length(R))) + @nloops(N, i, A, + d->(state_{d-1} = state_d), + d->(skip_d || (state_d = state_0)), begin + @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k) | unsafe_isna(S, Sextr, state_0) + if xna + !isa(C, Nothing) && @inbounds C[state_0] -= 1 + else + @inbounds s = unsafe_getindex_notna(S, Sextr, state_0) + @inbounds x = data[k] + v = evaluate(f, x, s) + @inbounds v0 = new_data[state_0] + nv = evaluate(op, v0, v) + @inbounds new_data[state_0] = nv + end + + state_0 += 1 + k += 1 + end) + end + return R +end + +immutable Abs2MinusFun end +Base.evaluate(f::Abs2MinusFun, x, m) = abs2(x - m) + +function Base.varm!(R::AbstractArray, A::DataArray, m::AbstractArray; corrected::Bool=true, + skipna::Bool=false, init::Bool=true) + if isempty(A) + fill!(R, convert(eltype(R), NaN)) + else + init && fill!(R, zero(eltype(R))) + if skipna + C = Array(Int, size(R)) + + # Compute R = abs2(A-m) + _mapreducedim_skipna_2arg!(Abs2MinusFun(), Base.AddFun(), R, C, A, m) + + # Divide by number of non-NA values + if corrected + for i = 1:length(C) + @inbounds C[i] = max(C[i] - 1, 0) + end + end + broadcast!(/, R, R, C) + else + # Compute R = abs2(A-m) + _mapreducedim_2arg!(Abs2MinusFun(), Base.AddFun(), R, A, m) + + # Divide by number of values + broadcast!(/, R, R, div(length(A), length(R)) - int(corrected)) + end + end +end + +function Base.varzm!(R::AbstractArray, A::DataArray; corrected::Bool=true, + skipna::Bool=false, init::Bool=true) + if isempty(A) + fill!(R, convert(eltype(R), NaN)) + else + init && fill!(R, zero(eltype(R))) + if skipna + C = Array(Int, size(R)) + _mapreducedim_skipna_impl!(Base.Abs2Fun(), Base.AddFun(), R, C, A) + if corrected + for i = 1:length(C) + @inbounds C[i] = max(C[i] - 1, 0) + end + end + broadcast!(/, R, R, C) + else + Base.sumabs2!(R, A; init=true) + broadcast!(/, R, R, div(length(A), length(R)) - int(corrected)) + end + end +end + +Base.varm{T}(A::DataArray{T}, m::AbstractArray, region; corrected::Bool=true, + skipna::Bool=false) = + Base.varm!(Base.reducedim_initarray(A, region, zero(Base.momenttype(T))), A, m; + corrected=corrected, skipna=skipna, init=false) + +Base.varzm{T}(A::DataArray{T}, region::Union(Integer, AbstractArray, Tuple); + corrected::Bool=true, skipna::Bool=false) = + Base.varzm!(Base.reducedim_initarray(A, region, zero(Base.momenttype(T))), A; + corrected=corrected, skipna=skipna, init=false) + +function Base.var{T}(A::DataArray{T}, region::Union(Integer, AbstractArray, Tuple); + corrected::Bool=true, mean=nothing, skipna::Bool=false) + if mean == 0 + Base.varzm(A, region; corrected=corrected, skipna=skipna) + elseif mean == nothing + if skipna + # Can reduce mean into ordinary array + m = zeros(Base.momenttype(T), Base.reduced_dims(A, region)) + Base.varm(A, Base.mean!(m, A; skipna=skipna), region; + corrected=corrected, skipna=skipna) + else + Base.varm(A, Base.mean(A, region; skipna=skipna), region; + corrected=corrected, skipna=skipna) + end + elseif isa(mean, AbstractArray) + Base.varm(A, mean::AbstractArray, region; corrected=corrected, skipna=skipna) + else + error("invalid value of mean") + end +end diff --git a/test/reducedim.jl b/test/reducedim.jl index b5ad91f..452a3a7 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -14,6 +14,18 @@ function test_any() end test_any() +function test_count() + bits = randbool(64*5) + for i = 1:length(bits), j = i:length(bits) + v = 0 + for k = i:j + v += bits[k] + end + Base.Test.@test DataArrays._count(bits, i, j) == v + end +end +test_count() + # mapslices from Base, hacked to work for these cases function safe_mapslices{T}(f::Function, A::AbstractArray{T}, region, skipna) dims = intersect(region, 1:ndims(A)) @@ -105,12 +117,16 @@ macro test_da_approx_eq(da1, da2) end end +myvarzm(x; skipna::Bool=false) = var(x; mean=0, skipna=skipna) +myvar1m(x; skipna::Bool=false) = var(x; mean=1, skipna=skipna) + for Areduc in (DataArray(rand(3, 4, 5, 6)), DataArray(rand(3, 4, 5, 6), rand(3, 4, 5, 6) .< 0.2)) for skipna = (false, true) for region in { 1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), (1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)} + # println("region = $region, skipna = $skipna") r = DataArray(fill(NaN, Base.reduced_dims(size(Areduc), region))) @test_da_approx_eq sum!(r, Areduc; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) @test_da_approx_eq prod!(r, Areduc; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) @@ -118,6 +134,7 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) @test_da_approx_eq Base.sumabs!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) @test_da_approx_eq Base.sumabs2!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) + @test_da_approx_eq mean!(r, Areduc; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) @test_da_approx_eq sum(Areduc, region; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) @test_da_approx_eq prod(Areduc, region; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) @@ -125,6 +142,12 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), @test_da_approx_eq minimum(Areduc, region; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) @test_da_approx_eq Base.sumabs(Areduc, region; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) @test_da_approx_eq Base.sumabs2(Areduc, region; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) + @test_da_approx_eq mean(Areduc, region; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) + if region != 5 + @test_da_approx_eq var(Areduc, region; skipna=skipna) safe_mapslices(var, Areduc, region, skipna) + @test_da_approx_eq var(Areduc, region; mean=0, skipna=skipna) safe_mapslices(myvarzm, Areduc, region, skipna) + @test_da_approx_eq var(Areduc, region; mean=fill!(r, 1), skipna=skipna) safe_mapslices(myvar1m, Areduc, region, skipna) + end end end end From 2d1eec7407d377c482879301e9dd940bc62a94d9 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 16 Jul 2014 15:04:12 -0400 Subject: [PATCH 4/7] Performance tweaks, add benchmarks --- benchmark/operators.jl | 3 +-- benchmark/reduce.jl | 49 ++++++++++++++++++++++++++++++++++ benchmark/reducedim.jl | 53 +++++++++++++++++++++++++++++++++++++ src/reduce.jl | 24 ----------------- src/reducedim.jl | 60 +++++++++++++++++++++++------------------- 5 files changed, 136 insertions(+), 53 deletions(-) create mode 100644 benchmark/reduce.jl create mode 100644 benchmark/reducedim.jl diff --git a/benchmark/operators.jl b/benchmark/operators.jl index b7b4c0e..ea4b8cc 100644 --- a/benchmark/operators.jl +++ b/benchmark/operators.jl @@ -35,7 +35,7 @@ macro perf(fn, replications, idx...) quote println($name) gc_disable() - df = compare([()->$fn for i=$idx], $replications) + df = compare([let i=i; ()->$fn; end for i=$idx], $replications) gc_enable() gc() df[:Function] = TEST_NAMES[$idx] @@ -71,7 +71,6 @@ const Bool2 = make_test_types(make_bools, 1000) @perf Bool1[i] $ Bool2[i] 100 # Vector operators -@perf sum(Float1[i]) 250 1:div(length(Float1), 2) @perf diff(Float1[i]) 50 1:div(length(Float1), 2) @perf cumsum(Float1[i]) 50 1:div(length(Float1), 2) end diff --git a/benchmark/reduce.jl b/benchmark/reduce.jl new file mode 100644 index 0000000..eebc227 --- /dev/null +++ b/benchmark/reduce.jl @@ -0,0 +1,49 @@ +module ReduceBenchmark +using DataArrays, Benchmark + +# seed rng for more consistent timings +srand(1776) + +const TEST_NAMES = [ + "Vector", + "DataVector No NA skipna=false", + "DataVector No NA skipna=true", + "DataVector Half NA skipna=false", + "DataVector Half NA skipna=true" +] + +function make_test_types(genfunc, sz) + mat = genfunc(sz) + na = shuffle!([trues(ifloor(sz/2)), falses(iceil(sz/2))]) + ( + mat, + DataArray(mat), + DataArray(mat, na) + ) +end + +const Data = make_test_types(rand, 100000000) + +macro perf(fn, replications) + quote + println($fn) + fns = [()->$fn(Data[1]), + ()->$fn(Data[2]), + ()->$fn(Data[2]; skipna=true), + ()->$fn(Data[3]), + ()->$fn(Data[3]; skipna=true)] + gc_disable() + df = compare(fns, $replications) + gc_enable() + gc() + df[:Function] = TEST_NAMES + df[:Relative] = df[:Average]./df[1, :Average] + println(df) + end +end + +@perf sum 10 +@perf maximum 10 +@perf mean 10 +@perf var 10 +end diff --git a/benchmark/reducedim.jl b/benchmark/reducedim.jl new file mode 100644 index 0000000..29a6406 --- /dev/null +++ b/benchmark/reducedim.jl @@ -0,0 +1,53 @@ +module ReducedimBenchmark +using DataArrays, Benchmark + +# seed rng for more consistent timings +srand(1776) + +const TEST_NAMES = [ + "Matrix", + "DataMatrix No NA skipna=false", + "DataMatrix No NA skipna=true", + "DataMatrix Half NA skipna=false", + "DataMatrix Half NA skipna=true" +] + +function make_test_types(genfunc, sz) + mat = genfunc(abs2(sz)) + na = shuffle!([trues(ifloor(abs2(sz)/2)), falses(iceil(abs2(sz)/2))]) + ( + reshape(mat, sz, sz), + DataArray(reshape(mat, sz, sz)), + DataArray(reshape(mat, sz, sz), reshape(na, sz, sz)) + ) +end + +const Data = make_test_types(rand, 10000) + +macro perf(fn, dim, replications) + quote + println($fn, " (region = ", $dim, ")") + fns = [()->$fn(Data[1], $dim), + ()->$fn(Data[2], $dim), + ()->$fn(Data[2], $dim; skipna=true), + ()->$fn(Data[3], $dim), + ()->$fn(Data[3], $dim; skipna=true)] + gc_disable() + df = compare(fns, $replications) + gc_enable() + gc() + df[:Function] = TEST_NAMES + df[:Relative] = df[:Average]./df[1, :Average] + println(df) + end +end + +@perf sum 1 10 +@perf sum 2 10 +@perf maximum 1 10 +@perf maximum 2 10 +@perf mean 1 10 +@perf mean 2 10 +@perf var 1 10 +@perf var 2 10 +end diff --git a/src/reduce.jl b/src/reduce.jl index 549b342..50dedf5 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -32,30 +32,6 @@ function mapreduce_seq_impl_skipna(f, op, T, A::DataArray, ifirst::Int, ilast::I v end -# Kernel that skips NA without branching -# Only used for cases where branching is more expensive than computing for NAs -typealias FastMapFuns Union(Base.IdFun, Base.AbsFun, Base.Abs2Fun) -typealias FastReduceFuns Union(Base.AddFun, Base.MulFun, Base.AndFun, Base.OrFun) -typealias FastBitsTypes Union(Type{Uint8}, Type{Uint16}, Type{Uint32}, Type{Uint64}, - Type{Int8}, Type{Int16}, Type{Int32}, Type{Int64}, - Type{Float32}, Type{Float64}) -function mapreduce_seq_impl_skipna(f::FastMapFuns, op::FastReduceFuns, T::FastBitsTypes, - A::DataArray, ifirst::Int, ilast::Int) - data = A.data - na = A.na - chunks = na.chunks - - v, i = skipna_init(f, op, na, data, ifirst, ilast) - - while i < ilast - i += 1 - @inbounds na = Base.unsafe_bitgetindex(chunks, i) - @inbounds d = data[i] - v = ifelse(na, v, evaluate(op, v, evaluate(f, d))) - end - v -end - # Pairwise map-reduce function mapreduce_pairwise_impl_skipna{T}(f, op, A::DataArray{T}, bytefirst::Int, bytelast::Int, n_notna::Int, blksize::Int) if n_notna <= blksize diff --git a/src/reducedim.jl b/src/reducedim.jl index 40977fa..7ed9e76 100644 --- a/src/reducedim.jl +++ b/src/reducedim.jl @@ -59,7 +59,7 @@ end ## NA-preserving @ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns, - R::AbstractArray, A::DataArray{T,N}) + R::DataArray, A::DataArray{T,N}) data = A.data na = A.na @@ -73,18 +73,14 @@ end extr = daextract(R) for i = 1:nslices if _any(na, ibase+1, ibase+lsiz) - if isa(R, DataArray) - unsafe_setna!(R, extr, i) - else - error("cannot reduce a DataArray containing NAs to an AbstractArray") - end + unsafe_setna!(R, extr, i) else v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz) @inbounds unsafe_dasetindex!(R, extr, v, i) end ibase += lsiz end - elseif isa(R, DataArray) + else @nextract N sizeR d->size(R,d) na_chunks = A.na.chunks @@ -118,6 +114,33 @@ end end) R.na = bitpack(new_na) + end + return R +end + +## NA-preserving to array +@ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns, + R::AbstractArray, A::DataArray{T,N}) + data = A.data + na = A.na + + lsiz = Base.check_reducdims(R, data) + isempty(data) && return R + + if lsiz > 16 + # use mapreduce_impl, which is probably better tuned to achieve higher performance + nslices = div(length(A), lsiz) + ibase = 0 + extr = daextract(R) + for i = 1:nslices + if _any(na, ibase+1, ibase+lsiz) + error("cannot reduce a DataArray containing NAs to an AbstractArray") + else + v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz) + @inbounds unsafe_dasetindex!(R, extr, v, i) + end + ibase += lsiz + end else @nextract N sizeR d->size(R,d) @@ -317,7 +340,7 @@ Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) # A version of _mapreducedim! that accepts an array S of the same size # as R, the elements of which are passed as a second argument to f. -@ngenerate N typeof(R) function _mapreducedim_2arg!{T,N}(f, op, R::AbstractArray, +@ngenerate N typeof(R) function _mapreducedim_2arg!{T,N}(f, op, R::DataArray, A::DataArray{T,N}, S::AbstractArray) data = A.data @@ -347,7 +370,7 @@ Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) end ibase += lsiz end - elseif isa(R, DataArray) + else @nextract N sizeR d->size(R,d) na_chunks = A.na.chunks new_data = R.data @@ -376,19 +399,6 @@ Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) end) R.na = bitpack(new_na) - else - @nextract N sizeR d->size(R,d) - - # If reducing to a non-DataArray, throw an error at the start on NA - (any(isna(A)) || any(isna(S))) && error("cannot reduce a DataArray containing NAs to an AbstractArray") - @nloops N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin - @inbounds s = unsafe_getindex_notna(S, Sextr, state_0) - @inbounds x = (@nref N data i) - v = evaluate(f, x, s) - @inbounds v0 = (@nref N R j) - nv = evaluate(op, v0, v) - @inbounds (@nref N R j) = nv - end end return R end @@ -413,11 +423,7 @@ end # If there are any NAs in S, assume these will produce NAs in R if isa(S, DataArray) - if isa(R, DataArray) - copy!(R.na, S.na) - elseif any(S.na) - error("cannot reduce a DataArray containing NAs to an AbstractArray") - end + copy!(R.na, S.na) end if lsiz > 16 From c6a9f9d05fb9527489f58a11f313da7b968f2cc5 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 16 Jul 2014 15:18:37 -0400 Subject: [PATCH 5/7] Fix bugs and add tests for reductions with non-DataArray output --- src/indexing.jl | 1 + src/reducedim.jl | 10 ++++++++-- test/reducedim.jl | 28 +++++++++++++++------------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index c15374d..e6c8eee 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -44,6 +44,7 @@ unsafe_dasetindex!(da::PooledDataArray, extr, val::NAtype, idx::Real) = nothing unsafe_dasetindex!(da::DataArray, extr, val, idx::Real) = setindex!(extr[1], val, idx) unsafe_dasetindex!(pda::PooledDataArray, extr, val, idx::Real) = setindex!(extr[1], getpoolidx(pda, val), idx) +unsafe_dasetindex!(a::AbstractArray, extr, val, idx::Real) = setindex!(a, val, idx) ## PooledDataArray helper functions diff --git a/src/reducedim.jl b/src/reducedim.jl index 7ed9e76..83960c4 100644 --- a/src/reducedim.jl +++ b/src/reducedim.jl @@ -224,8 +224,14 @@ _mapreducedim_skipna!(f, op, R::AbstractArray, A::DataArray) = _mapreducedim_skipna_impl!(f, op, R, nothing, A) # for MinFun/MaxFun, min or max is NA if all values along a dimension are NA +function _mapreducedim_skipna!(f, op::Union(Base.MinFun, Base.MaxFun), R::DataArray, A::DataArray) + R.na = bitpack(all!(fill(true, size(R)), A.na)) + _mapreducedim_skipna_impl!(f, op, R, nothing, A) +end function _mapreducedim_skipna!(f, op::Union(Base.MinFun, Base.MaxFun), R::AbstractArray, A::DataArray) - R.na = bitpack(all!(fill(true, size(R.na)), A.na)) + if any(all!(fill(true, size(R)), A.na)) + error("all values along specified dimension are NA for one element of reduced dimension; cannot reduce to non-DataArray") + end _mapreducedim_skipna_impl!(f, op, R, nothing, A) end @@ -374,7 +380,7 @@ Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) @nextract N sizeR d->size(R,d) na_chunks = A.na.chunks new_data = R.data - new_na = bitunpack(S.na) + new_na = isa(S, DataArray) ? bitunpack(S.na) : fill(false, size(S)) @nexprs 1 d->(state_0 = state_{N} = 1) @nexprs N d->(skip_d = sizeR_d == 1) diff --git a/test/reducedim.jl b/test/reducedim.jl index 452a3a7..3eac6a7 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -105,10 +105,6 @@ macro test_da_approx_eq(da1, da2) v1 = $da1 v2 = $da2 na = isna(v1) - if na != isna(v2) - println(v1) - println(v2) - end @test na == isna(v2) defined = !na if any(defined) @@ -127,14 +123,17 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), 1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), (1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)} # println("region = $region, skipna = $skipna") - r = DataArray(fill(NaN, Base.reduced_dims(size(Areduc), region))) - @test_da_approx_eq sum!(r, Areduc; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) - @test_da_approx_eq prod!(r, Areduc; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) - @test_da_approx_eq maximum!(r, Areduc; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) - @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) - @test_da_approx_eq Base.sumabs!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) - @test_da_approx_eq Base.sumabs2!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) - @test_da_approx_eq mean!(r, Areduc; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) + outputs = {DataArray(fill(NaN, Base.reduced_dims(size(Areduc), region)))} + !anyna(Areduc) && push!(outputs, outputs[1].data) + for r in outputs + @test_da_approx_eq sum!(r, Areduc; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) + @test_da_approx_eq prod!(r, Areduc; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) + @test_da_approx_eq maximum!(r, Areduc; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) + @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) + @test_da_approx_eq Base.sumabs!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) + @test_da_approx_eq Base.sumabs2!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) + @test_da_approx_eq mean!(r, Areduc; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) + end @test_da_approx_eq sum(Areduc, region; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) @test_da_approx_eq prod(Areduc, region; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) @@ -143,10 +142,13 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), @test_da_approx_eq Base.sumabs(Areduc, region; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) @test_da_approx_eq Base.sumabs2(Areduc, region; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) @test_da_approx_eq mean(Areduc, region; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) + if region != 5 @test_da_approx_eq var(Areduc, region; skipna=skipna) safe_mapslices(var, Areduc, region, skipna) @test_da_approx_eq var(Areduc, region; mean=0, skipna=skipna) safe_mapslices(myvarzm, Areduc, region, skipna) - @test_da_approx_eq var(Areduc, region; mean=fill!(r, 1), skipna=skipna) safe_mapslices(myvar1m, Areduc, region, skipna) + for r in outputs + @test_da_approx_eq var(Areduc, region; mean=fill!(r, 1), skipna=skipna) safe_mapslices(myvar1m, Areduc, region, skipna) + end end end end From 888596b45fbda5272bdd2a39f7ea7dd48cea15ad Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 16 Jul 2014 15:29:39 -0400 Subject: [PATCH 6/7] Add some additional tests for reductions to non-DataArrays --- test/reducedim.jl | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/test/reducedim.jl b/test/reducedim.jl index 3eac6a7..7beb71e 100644 --- a/test/reducedim.jl +++ b/test/reducedim.jl @@ -123,13 +123,24 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), 1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), (1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)} # println("region = $region, skipna = $skipna") + outputs = {DataArray(fill(NaN, Base.reduced_dims(size(Areduc), region)))} - !anyna(Areduc) && push!(outputs, outputs[1].data) + has_na = anyna(Areduc) + if has_na && !skipna + # Should throw an error reducing to non-DataArray + @test_throws ErrorException sum!(outputs[1].data, Areduc; skipna=skipna) + else + # Should be able to reduce to non-DataArray + push!(outputs, outputs[1].data) + end + for r in outputs @test_da_approx_eq sum!(r, Areduc; skipna=skipna) safe_mapslices(sum, Areduc, region, skipna) @test_da_approx_eq prod!(r, Areduc; skipna=skipna) safe_mapslices(prod, Areduc, region, skipna) - @test_da_approx_eq maximum!(r, Areduc; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) - @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) + if !has_na + @test_da_approx_eq maximum!(r, Areduc; skipna=skipna) safe_mapslices(maximum, Areduc, region, skipna) + @test_da_approx_eq minimum!(r, Areduc; skipna=skipna) safe_mapslices(minimum, Areduc, region, skipna) + end @test_da_approx_eq Base.sumabs!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs(Areduc), region, skipna) @test_da_approx_eq Base.sumabs2!(r, Areduc; skipna=skipna) safe_mapslices(sum, abs2(Areduc), region, skipna) @test_da_approx_eq mean!(r, Areduc; skipna=skipna) safe_mapslices(mean, Areduc, region, skipna) @@ -153,4 +164,14 @@ for Areduc in (DataArray(rand(3, 4, 5, 6)), end end end + +# Test NA-skipping behavior for maximum +a = @data([NA NA; 3 4]) +@test isequal(maximum(a, 1; skipna=true), [3 4]) +@test isequal(maximum!(zeros(1, 2), a; skipna=true), [3 4]) + +# Maximum should give an NA in the output if all values along dimension are NA +@test isequal(maximum(a, 2; skipna=true), @data([NA 4])') +# Maximum should refuse to reduce to a non-DataArray +@test_throws ErrorException maximum!(zeros(2, 1), a; skipna=true) end From 0381da705ac9f696f0fcb049fe6f9a2c2af48921 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 16 Jul 2014 15:38:46 -0400 Subject: [PATCH 7/7] Remove a snippet of dead code --- src/reducedim.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/reducedim.jl b/src/reducedim.jl index 83960c4..b7239e0 100644 --- a/src/reducedim.jl +++ b/src/reducedim.jl @@ -364,11 +364,7 @@ Base.evaluate(f::MapReduceDim2ArgHelperFun, x) = evaluate(f.f, x, f.val) extr = daextract(R) for i = 1:nslices if unsafe_isna(S, Sextr, i) || _any(na, ibase+1, ibase+lsiz) - if isa(R, DataArray) - unsafe_setna!(R, extr, i) - else - error("cannot reduce a DataArray containing NAs to an AbstractArray") - end + unsafe_setna!(R, extr, i) else @inbounds s = unsafe_getindex_notna(S, Sextr, i) v = Base.mapreduce_impl(MapReduceDim2ArgHelperFun(f, s), op, data, ibase+1, ibase+lsiz)