From 629676922c4ac599a24dfb0dc21b973277b7b1fd Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Feb 2017 22:11:20 +0100 Subject: [PATCH] Change unique() to return values in the same ordering as levels for PDAs While the generic unique() method says it preserves the order of appearance, the ordering of levels is more likely to be useful. In particular, it will allow StatsModels to use unique() to get levels present in the data in the user-defined order, with the first level as reference by default. The new code is based on CategoricalArrays. --- src/pooleddataarray.jl | 58 ++++++++++++----------------------------- test/pooleddataarray.jl | 20 ++++++++------ 2 files changed, 28 insertions(+), 50 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index dcdaeed..8ef19c7 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -271,9 +271,9 @@ end #' @description #' #' Return a DataVector containing the unique values of a `PooledDataArray`, -#' in the order they appear in the data, including `NA` if any missing entries +#' in the order of `levels`, including `NA` if any missing entries #' are encountered. For `PooledDataArray`s, this function is much less efficient -#' than `levels`, which does not return the values in the same order. +#' than `levels`. #' #' @param da::DataArray{T} `DataArray` whose unique values are desired. #' @@ -286,50 +286,24 @@ end #' pdv = @pdata [1, -2, 1, NA, 4] #' distinct_values = unique(pdv) function Base.unique{T}(pda::PooledDataArray{T}) - n = length(pda) nlevels = length(pda.pool) - unique_values = Vector{T}(0) - sizehint!(unique_values, nlevels) - seen = Set{eltype(pda.refs)}() - - firstna = 0 - for i in 1:n - if isna(pda, i) - if firstna == 0 - firstna = length(unique_values) + 1 - end - elseif !in(pda.refs[i], seen) - push!(seen, pda.refs[i]) - push!(unique_values, pda.pool[pda.refs[i]]) - else - continue - end - - if firstna > 0 && length(unique_values) == nlevels - break + seen = fill(false, nlevels + 1) + batch = 0 + @inbounds for i in pda.refs + seen[i + 1] = true + # Only do a costly short-circuit check periodically + batch += 1 + if batch > 1000 + all(seen) && break + batch = 0 end end - - if firstna > 0 - res = DataArray(Vector{T}(nlevels + 1)) - i = 0 - for val in unique_values - i += 1 - if i == firstna - res.na[i] = true - i += 1 - end - res.data[i] = val - end - - if firstna == nlevels + 1 - res.na[nlevels + 1] = true - end - - return res - else - return DataArray(unique_values) + seenna = shift!(seen) + res = DataArray(levels(pda)[seen]) + if seenna + push!(res, NA) end + res end #' @description diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index ce99b49..610c829 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -31,16 +31,20 @@ module TestPDA @assert levels(setlevels!(@pdata([1.0, 2.0]), [3,4])) == [3.0, 4.0] y = @pdata [1, NA, -2, 1, NA, 4, NA] - @assert isequal(unique(y), @pdata [1, NA, -2, 4]) - @assert isequal(unique(reverse(y)), @data [NA, 4, 1, -2]) - @assert isequal(unique(dropna(y)), @data [1, -2, 4]) - @assert isequal(unique(reverse(dropna(y))), @data [4, 1, -2]) + @assert isequal(unique(y), @pdata [-2, 1, 4, NA]) + @assert isequal(unique(reverse(y)), @data [-2, 1, 4, NA]) + @assert isequal(unique(dropna(y)), @data levels(dropna(y))) + @assert isequal(unique(reverse(dropna(y))), @data levels(reverse(dropna(y)))) z = @pdata ["frank", NA, "gertrude", "frank", NA, "herbert", NA] - @assert isequal(unique(z), @pdata ["frank", NA, "gertrude", "herbert"]) - @assert isequal(unique(reverse(z)), @pdata [NA, "herbert", "frank", "gertrude"]) - @assert isequal(unique(dropna(z)), @pdata ["frank", "gertrude", "herbert"]) - @assert isequal(unique(reverse(dropna(z))), @pdata ["herbert", "frank", "gertrude"]) + @assert isequal(unique(z), @pdata ["frank", "gertrude", "herbert", NA]) + @assert isequal(unique(reverse(z)), @pdata ["frank", "gertrude", "herbert", NA]) + @assert isequal(unique(dropna(z)), @data levels(dropna(z))) + @assert isequal(unique(reverse(dropna(z))), @data levels(reverse(dropna(z)))) + + # check case where some levels are not present in data + z[3] = "frank" + @assert isequal(unique(z), @pdata ["frank", "herbert", NA]) # check case where only NA occurs in final position @assert isequal(unique(@pdata [1, 2, 1, NA]), @pdata [1, 2, NA])