Skip to content

Commit a066c07

Browse files
committed
add the created field to GroupBy and GatherBy
we are trying to address #102
1 parent 098c851 commit a066c07

File tree

2 files changed

+23
-12
lines changed

2 files changed

+23
-12
lines changed

src/sort/gatherby.jl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,9 @@ mutable struct GatherBy
125125
mapformats::Bool
126126
perm
127127
starts
128+
created::DateTime
128129
end
129-
Base.copy(gds::GatherBy) = GatherBy(copy(gds.parent), copy(gds.groupcols), copy(gds.groups), gds.lastvalid, gds.mapformats, gds.perm === nothing ? nothing : copy(gds.perm), gds.starts === nothing ? nothing : copy(gds.starts))
130+
"""
    copy(gds::GatherBy)

Return a new `GatherBy` built from copies of the parent data set, the grouping
columns and the group vector of `gds`. `groups`, `perm` and `starts` are copied
only when they are not `nothing`; `lastvalid` and `mapformats` are passed through.

The `created` stamp of the result is read from the *copied* parent, because
`_check_consistency` compares `created` against the last-modified value of the
object's own parent.
"""
function Base.copy(gds::GatherBy)
    parent_copy = copy(gds.parent)
    # `hm_gatherby` constructs a GatherBy with `groups = nothing`, so guard it
    # the same way `perm` and `starts` are guarded.
    groups_copy = gds.groups === nothing ? nothing : copy(gds.groups)
    perm_copy = gds.perm === nothing ? nothing : copy(gds.perm)
    starts_copy = gds.starts === nothing ? nothing : copy(gds.starts)
    # NOTE(review): presumably `copy` of a data set does not preserve its
    # last-modified stamp; reading it from the copy is correct either way.
    created = _get_lastmodified(_attributes(parent_copy))
    return GatherBy(parent_copy, copy(gds.groupcols), groups_copy, gds.lastvalid, gds.mapformats, perm_copy, starts_copy, created)
end
130131

131132

132133
nrow(ds::GatherBy) = nrow(ds.parent)
@@ -176,28 +177,28 @@ end
176177
function gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats::Bool = true, stable::Bool = true, isgathered::Bool = false, eachrow::Bool = false, threads = true)
177178
colsidx = index(ds)[cols]
178179
if isempty(ds)
179-
return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing)
180+
return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
180181
end
181182

182183
T = nrow(ds) < typemax(Int32) ? Int32 : Int64
183184
_check_consistency(ds)
184185
if isgathered
185186
if eachrow
186-
return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds))
187+
return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds), _get_lastmodified(_attributes(ds)))
187188
else
188189
colindex, ranges, last_valid_index = _find_starts_of_groups(ds, colsidx, Val(T); mapformats = mapformats, threads = threads)
189190
groups = Vector{T}(undef, nrow(ds))
190191
_group_creator!(groups, ranges, last_valid_index)
191-
return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges)
192+
return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges, _get_lastmodified(_attributes(ds)))
192193
end
193194
else
194195
if eachrow
195196
a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
196197
b = compute_indices(a[1], a[3], nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64); threads = threads)
197-
return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds))
198+
return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds), _get_lastmodified(_attributes(ds)))
198199
else
199200
a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
200-
return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing)
201+
return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
201202
end
202203
end
203204
end
@@ -215,7 +216,7 @@ function hm_gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats = f
215216
gds = groupby(ds, [:___tmp___cols8934_2, :___tmp___cols8934], stable = false, threads = threads)
216217
grpcols, ranges, last_valid_index = _find_starts_of_groups(view(ds, gds.perm, cols), cols, nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64); mapformats = mapformats, threads = threads)
217218
select!(ds, Not([:___tmp___cols8934, :___tmp___cols8934_2]))
218-
GatherBy(ds, grpcols, nothing, last_valid_index, mapformats, gds.perm, ranges)
219+
GatherBy(ds, grpcols, nothing, last_valid_index, mapformats, gds.perm, ranges, _get_lastmodified(_attributes(ds)))
219220
end
220221

221222
function _fill_mapreduce_col!(x, f, op, y, loc)

src/sort/groupby.jl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,10 @@ mutable struct GroupBy
167167
starts
168168
lastvalid
169169
mapformats::Bool
170+
created::DateTime
170171
end
171172

172-
Base.copy(gds::GroupBy) = GroupBy(copy(gds.parent), copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats)
173+
"""
    copy(gds::GroupBy)

Return a new `GroupBy` built from copies of the parent data set, the grouping
columns, `rev`, `perm` and `starts` of `gds`; `lastvalid` and `mapformats` are
passed through unchanged.

The `created` stamp of the result is read from the *copied* parent, because
`_check_consistency` compares `created` against the last-modified value of the
object's own parent.
"""
function Base.copy(gds::GroupBy)
    parent_copy = copy(gds.parent)
    # NOTE(review): presumably `copy` of a data set does not preserve its
    # last-modified stamp; reading it from the copy is correct either way.
    created = _get_lastmodified(_attributes(parent_copy))
    return GroupBy(parent_copy, copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats, created)
end
173174

174175
nrow(ds::GroupBy) = nrow(ds.parent)
175176
ncol(ds::GroupBy) = ncol(ds.parent)
@@ -185,10 +186,10 @@ function groupby(ds::Dataset, cols::MultiColumnIndex; alg = HeapSortAlg(), rev =
185186
_check_consistency(ds)
186187
colsidx = index(ds)[cols]
187188
if isempty(ds)
188-
return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats)
189+
return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats, _get_lastmodified(_attributes(ds)))
189190
end
190191
a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads)
191-
GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats)
192+
GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds)))
192193
end
193194

194195
groupby(ds::Dataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads)
@@ -209,7 +210,7 @@ function groupby(ds::GroupBy, cols::MultiColumnIndex; alg = HeapSortAlg(), rev =
209210
colsidx = index(ds)[cols]
210211
grng = GIVENRANGE(copy(_get_perms(ds)),copy(_group_starts(ds)), nothing, _ngroups(ds))
211212
a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, givenrange = grng, skipcol = -1, threads = threads)
212-
GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats)
213+
GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(parent(ds))))
213214
end
214215
groupby(ds::GroupBy, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads)
215216

@@ -589,6 +590,15 @@ function groupby(ds::SubDataset, cols::MultiColumnIndex; alg = HeapSortAlg(), re
589590
_check_consistency(ds)
590591
colsidx = index(ds)[cols]
591592
a = _sortperm_v(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads)
592-
GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats)
593+
GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds)))
593594
end
594595
groupby(ds::SubDataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads)
596+
597+
598+
### check consistency of grouped data - GroupBy, GatherBy
599+
600+
"""
    _check_consistency(ds::Union{GroupBy, GatherBy})

Check that the parent data set of the grouped data set `ds` has not been
modified since `ds` was created, by comparing `ds.created` with the parent's
current last-modified value.

Returns `nothing` when the two agree.

# Throws
- `AssertionError`: if the stamps differ, i.e. the parent was modified after
  grouping and the data must be regrouped.
"""
function _check_consistency(ds::Union{GroupBy, GatherBy})
    created_at = ds.created
    parent_modified = _get_lastmodified(_attributes(parent(ds)))
    # Throw explicitly instead of using `@assert`: assertions may be disabled
    # at some optimization levels, and this check guards against silently
    # using stale grouping information. The exception type and message are
    # kept identical to what `@assert` produced.
    created_at == parent_modified || throw(AssertionError("The parent data set which the grouped data set is based on has been modified. To fix the issue regroup data."))
    return nothing
end

0 commit comments

Comments
 (0)