-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadaptive_blockquickselect.hpp
More file actions
402 lines (325 loc) · 17.8 KB
/
adaptive_blockquickselect.hpp
File metadata and controls
402 lines (325 loc) · 17.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#include <cmath>
#include <algorithm>
#include <cstdio>
#pragma once
template <std::size_t BlockSize=2048, std::size_t CacheLineSize=64, typename Iter, typename Comp>
void nth_element(Iter begin, Iter end, const Comp comp, std::size_t n);
/*
* Swaps the references held by iter1 and iter2 if the value contained in iter2 is greater than or equal to that
* of iter1. With -O2 and above, gets compiled down to a conditional move instruction, preventing a branch from
* occurring.
*/
template <typename Iter, typename Comp>
inline static void compareAndSwap(const Iter& iter1, const Iter& iter2, const Comp& comp) {
typename std::iterator_traits<Iter>::value_type item = *iter1;
*iter1 = comp(*iter2, item) ? *iter2 : *iter1;
*iter2 = comp(*iter2, item) ? item : *iter2;
}
/*
* Sorting network for an input of size 3. Sorts the first, middle, and final items of the array, then places the
* median of the three values at begin + 1.
*
* Returns the median of the 3 items sampled, which will serve as the pivot iterator.
*/
template<typename Iter, typename Comp>
inline static Iter medianOf3(const Iter& begin, const Iter& end, const Comp& comp) {
compareAndSwap(begin, begin + ((end - begin) >> 1), comp);
compareAndSwap(begin + ((end - begin) >> 1), end - 1, comp);
compareAndSwap(begin, begin + ((end - begin) >> 1), comp);
std::iter_swap(begin + 1, begin + ((end - begin) >> 1));
return begin + 1;
}
/*
* Somewhat naive implementation of block partition that handles two cases:
* (1) When staticBlockPartition() is finished with its main loop and has remaining elements to partition.
* (2) Arrays of size less than (2 * BlockSize) + 1, which are too small for staticBlockPartition().
*
* The function has two main phases - the partitioning phase and the cleanup phase.
*
* The partitioning phase consists of the following steps:
* (1) Divide the array chunk into two slices - a left and right slice.
* (2) Iterate over both slices in the same loop and fill the buffers.
* (left slice) Add the index (relative to unpartitioned_range_begin) of any item !comp() to the pivot to the left buffer.
* (right slice) Add the index (relative to last_unpartitioned_item) of any item comp() to the pivot to the right buffer.
* (3) Find the number of mutual swaps between the buffers, which is the size of the smaller buffer.
* (4) Iterate through the two buffers, swapping out of place items until the number of mutual swaps has been reached.
*
* The cleanup phase consists of the following steps:
* (1) unpartitioned_range_begin begins tracking the beginning of the !comp() partition of the array chunk.
* (2) If either buffer has remaining elements to swap, the elements are instead swapped to the extrema closest to the center
* of the respective partition.
* (3) unpartitioned_range_begin is updated to reflect the new beginning of the !comp() partition.
* (4) The pivot is moved to unpartitioned_range_begin - 1 and is used as the return value.
*
* Assumes that:
* (1) pivot < unpartitioned_range_begin
* (2) unpartitioned_range_begin <= last_unpartitioned_item TODO - verify that this method works on arrays of size 0 lol
* (3) left_buffer and right_buffer have space for at least (remaining_unpartitioned_size / 2) + 1 items
*
* Returns a fully partitioned array around pivot.
*/
template <typename Iter, typename Comp>
static inline Iter dynamicBlockPartition(Iter pivot, Iter unpartitioned_range_begin, Iter last_unpartitioned_item, std::size_t* left_buffer, std::size_t* right_buffer, const Comp& comp) {
/*
* Partitioning phase
*/
// + 1 because last_unpartitioned_item is inclusive
std::size_t remaining_unpartitioned_size = last_unpartitioned_item - unpartitioned_range_begin + 1;
std::size_t left_buffer_size = 0;
std::size_t right_buffer_size = 0;
// TODO I wonder if there's any difference between iterating backwards and iterating forwards starting at arr_size / 2
// Iterate over the array chunk, populating the left and right swap buffers.
for (size_t i = 0; i < remaining_unpartitioned_size >> 1; ++i) {
left_buffer[left_buffer_size] = i;
left_buffer_size += comp(*pivot, *(unpartitioned_range_begin + i));
right_buffer[right_buffer_size] = i;
right_buffer_size += comp(*(last_unpartitioned_item - i), *pivot);
}
// The number of items to swap is the size of the smaller bucket.
std::size_t num_mutual_swaps = std::min(left_buffer_size, right_buffer_size);
// Perform a mutual exchange of misplaced values.
for (size_t i = 0; i < num_mutual_swaps; ++i) {
std::iter_swap(
unpartitioned_range_begin + left_buffer[i],
last_unpartitioned_item - right_buffer[i]
);
}
/*
* Cleanup phase
*/
Iter initial_unpartitioned_range_begin(unpartitioned_range_begin);
// Update unpartitioned_range_begin to be at the beginning of the !comp() partition. If the array chunk is odd, the middle
// item has not been inspected. If this item compares with the pivot, the comp() partition's size is incremented by 1.
unpartitioned_range_begin += (remaining_unpartitioned_size >> 1);
unpartitioned_range_begin += ((remaining_unpartitioned_size % 2) && comp(*(unpartitioned_range_begin), *pivot));
// If there are remaining elements in the left buffer, move them to the right end of the comp() partition.
// The elements closer to the chunk center are moved first, then the elements at the edges of the array.
for (size_t i = 0; i < left_buffer_size - num_mutual_swaps; ++i) {
std::iter_swap(
initial_unpartitioned_range_begin + left_buffer[left_buffer_size - 1 - i],
unpartitioned_range_begin - 1 - i
);
}
// Ditto for the left buffer and the !comp() partition.
for (size_t i = 0; i < right_buffer_size - num_mutual_swaps; ++i) {
std::iter_swap(
last_unpartitioned_item - right_buffer[right_buffer_size - 1 - i],
unpartitioned_range_begin + i
);
}
left_buffer_size -= num_mutual_swaps;
right_buffer_size -= num_mutual_swaps;
// Move unpartitioned_range_begin to the left/right, depending on which (if any) of the buffers had remaining items.
unpartitioned_range_begin -= left_buffer_size;
unpartitioned_range_begin += right_buffer_size;
// Swap the pivot with the last item in the comp() partition.
std::iter_swap(unpartitioned_range_begin - 1, pivot);
return unpartitioned_range_begin - 1;
}
/*
* Delegator function for the above definition of dynamicBlockPartition(). Dynamically allocates buffers based on the
* chunk size. Called by the partition delegator when the size of the array to partition is too small for
* staticBlockPartition().
*
* Returns the result of the main dynamicBlockPartition() function.
*/
template <typename Iter, typename Comp>
static inline Iter dynamicBlockPartition(Iter pivot, Iter unpartitioned_range_begin, Iter last_unpartitioned_item, const Comp& comp) {
alignas(64) std::size_t* left_buffer = new std::size_t[((last_unpartitioned_item - unpartitioned_range_begin + 1) >> 1) + 1];
alignas(64) std::size_t* right_buffer = new std::size_t[((last_unpartitioned_item - unpartitioned_range_begin + 1) >> 1) + 1];
pivot = dynamicBlockPartition(pivot, unpartitioned_range_begin, last_unpartitioned_item, left_buffer, right_buffer, comp);
delete[] left_buffer;
delete[] right_buffer;
return pivot;
}
/*
*
* Works very similarly to dynamicBlockPartition(), except it uses statically allocated buffers of size BlockSize.
*
* Steps:
* (1) Processes two blocks, one on each side of the array chunk, until the remaining unpartitioned range < (2 * BlockSize) + 1.
* (2) Then, performs cleanup similarly to dynamicBlockPartition().
* (3) Sends the remaining unpartitioned array chunk to dynamicBlockPartition().
*
* Returns the correctly positioned pivot from dynamicBlockPartition().
*/
template<std::size_t BlockSize, std::size_t ItemsPerCacheLine, typename Iter, typename Comp>
static inline Iter staticBlockPartition(Iter pivot, Iter unpartitioned_range_begin, Iter unpartitioned_range_end, const Comp& comp) {
constexpr std::size_t BLOCK_PARTITION_THRESHOLD = BlockSize * 2;
typename std::iterator_traits<Iter>::value_type pivot_val = *pivot;
Iter last_item = unpartitioned_range_end - 1;
alignas(64) std::size_t left_buffer[BlockSize];
alignas(64) std::size_t right_buffer[BlockSize];
std::size_t left_buffer_size = 0;
std::size_t right_buffer_size = 0;
std::size_t num_mutual_swaps = 0;
/*
* Partitioning phase
*/
while (static_cast<std::size_t>(last_item - unpartitioned_range_begin + 1) > BLOCK_PARTITION_THRESHOLD) {
// If both buffers are empty, process the next left and right blocks, recording any indices which are not
// correctly partitioned.
if (left_buffer_size == 0 && right_buffer_size == 0) {
for (std::size_t i = 0; i < BlockSize; ) {
for (std::size_t j = 0; j < ItemsPerCacheLine; ++j) {
left_buffer[left_buffer_size] = i;
right_buffer[right_buffer_size] = i;
left_buffer_size += comp(pivot_val, *(unpartitioned_range_begin + i));
right_buffer_size += comp(*(last_item - i), pivot_val);
++i;
}
}
}
// If only the left bucket is empty, fill it.
else if (left_buffer_size == 0) {
for (std::size_t i = 0; i < BlockSize; ) {
for (std::size_t j = 0; j < ItemsPerCacheLine; ++j) {
left_buffer[left_buffer_size] = i;
left_buffer_size += comp(pivot_val, *(unpartitioned_range_begin + i));
++i;
}
}
}
// If only the right bucket is empty, fill it.
else if (right_buffer_size == 0) {
for (std::size_t i = 0; i < BlockSize; ) {
for (std::size_t j = 0; j < ItemsPerCacheLine; ++j) {
right_buffer[right_buffer_size] = i;
right_buffer_size += comp(*(last_item - i), pivot_val);
++i;
}
}
}
// The number of items to swap is the size of the smaller bucket
num_mutual_swaps = std::min(left_buffer_size, right_buffer_size);
// Perform a mutual exchange of misplaced values.
for (std::size_t i = 0; i < num_mutual_swaps; ++i) {
std::iter_swap(
unpartitioned_range_begin + left_buffer[left_buffer_size - 1 - i],
last_item - right_buffer[right_buffer_size - 1 - i]
);
}
left_buffer_size -= num_mutual_swaps;
right_buffer_size -= num_mutual_swaps;
// If left_buffer_size == 0, add BlockSize to unpartitioned_range_begin
unpartitioned_range_begin += (0 - (left_buffer_size == 0)) & BlockSize;
// If right_buffer_size == 0, subtract BlockSize from last_item
last_item -= (0 - (right_buffer_size == 0)) & BlockSize;
}
/*
* Cleanup phase for final iteration of main loop
*/
// If left_buffer_size != 0, add BlockSize to begin
unpartitioned_range_begin += (0 - (left_buffer_size != 0)) & BlockSize;
// If right_buffer_size != 0, subtract BlockSize from last
last_item -= (0 - (right_buffer_size != 0)) & BlockSize;
// If there are remaining elements in the left buffer, move them to the right end of the comp() partition.
// The elements closer to the chunk center are moved first, then the elements at the edges of the array.
for (std::size_t i = 0; i < left_buffer_size; ++i) {
std::iter_swap(
unpartitioned_range_begin - i - 1,
unpartitioned_range_begin - BlockSize + left_buffer[left_buffer_size - 1 - i]
);
}
// Ditto for the left buffer and the !comp() partition.
for (std::size_t i = 0; i < right_buffer_size; ++i) {
std::iter_swap(
last_item + 1 + i,
last_item + BlockSize - right_buffer[right_buffer_size - 1 - i]
);
}
unpartitioned_range_begin -= left_buffer_size;
last_item += right_buffer_size;
return dynamicBlockPartition(pivot, unpartitioned_range_begin, last_item, left_buffer, right_buffer, comp);
}
/*
* Determines the pivot, creates mini-partitions from the sampled items, and then delegates to the static or dynamic
* block partitioning methods, depending on the chunk size.
*/
template<std::size_t BlockSize, std::size_t ItemsPerCacheLine, typename Iter, typename Comp>
static inline Iter selectPivotAndPartitionArray(const Iter& begin, const Iter& end, const std::size_t& k, std::size_t offset, const Comp& comp) {
constexpr std::size_t AdaptiveSampleThreshold = 2 * ItemsPerCacheLine;
std::size_t chunk_size = end - begin;
std::size_t sample_size = sqrt(chunk_size);
// If the sample size is smaller than the threshold for adaptive sampling, fall back to median of 3 pivot selection
// and use dynamic block partitioning.
if (sample_size < AdaptiveSampleThreshold) {
Iter pivot = medianOf3(begin, end, comp);
return dynamicBlockPartition(pivot, begin + 2, end - 2, comp);
}
// The samples are moved to the beginning of the array, and of those samples, the pivot_pos-th largest sample is
// chosen to be the pivot. The pivot position is determined by the size of k relative to the size of the chunk to partition.
float pivot_ratio = float(k) / chunk_size;
std::size_t pivot_pos = pivot_ratio * sample_size;
// If the pivot position is large enough to make it viable, adjust the pivot position so that the pivot position is
// the closest power of ItemsPerCacheLine, minus 1. The sample size is also adjusted to maintain the original
// pivot position : sample size ratio.
if (pivot_pos >= (AdaptiveSampleThreshold)) {
// Round the pivot position to the closest power of ItemsPerCacheLine, minus 1
if (((pivot_pos % ItemsPerCacheLine) >= (ItemsPerCacheLine / 2)) || (pivot_pos < ItemsPerCacheLine)) {
pivot_pos += ((ItemsPerCacheLine - 1) - (pivot_pos % ItemsPerCacheLine));
}
else {
pivot_pos -= ((pivot_pos % ItemsPerCacheLine) + 1);
}
// Account for the offset of the array chunk. For example, if the chunk begins at ItemsPerCacheLine + 3,
// this will subtract 3 from the pivot position so that the unpartitioned range begins on the start of a cache
// line.
if (pivot_pos <= offset) { pivot_pos += (ItemsPerCacheLine - offset); }
else { pivot_pos -= offset; }
// Adjust the sample size based on changes to the pivot position.
sample_size = std::ceil(float(pivot_pos) / pivot_ratio);
}
// Use the beginning and end of (sample size / 2) cache lines as the samples.
for (std::size_t i = 0; i < (sample_size >> 1); ++i) {
std::iter_swap(begin + (i << 1), begin + (ItemsPerCacheLine * i));
std::iter_swap(begin + (i << 1) + 1, begin + (ItemsPerCacheLine * i) + (ItemsPerCacheLine - 1));
}
// If the sample size is odd, we'll just be lazy and use whatever value is already at begin + sample_size - 1. It's random enough.
Iter pivot = begin + pivot_pos;
// Find the pivot_pos-th element of the sampled items.
nth_element(begin, begin + sample_size, comp, pivot - begin);
// Swap all items in the sample slice that are !comp() to the pivot with items at the end of the chunk.
std::swap_ranges(pivot + 1, begin + sample_size, end - (sample_size - pivot_pos) + 1);
// Partition items between [pivot + 1, !comp() mini partition begin)
if (static_cast<std::size_t>((end - (sample_size - pivot_pos) + 1) - (pivot + 1)) > (BlockSize << 1)) {
return staticBlockPartition<BlockSize, ItemsPerCacheLine>(pivot, pivot + 1, end - (sample_size - pivot_pos) + 1, comp);
}
else {
return dynamicBlockPartition(pivot, pivot + 1, end - (sample_size - pivot_pos) + 1, comp);
}
}
// TODO refactor to match signature of std::nth_element
/*
* Standard quickselect loop. Narrows the range of items until it finds the nth_element.
*/
template <std::size_t BlockSize=2048, std::size_t CacheLineSize=64, typename Iter, typename Comp>
void nth_element(Iter begin, Iter end, const Comp comp, std::size_t n) {
constexpr std::size_t items_per_cache_line = CacheLineSize / sizeof(*begin);
std::size_t offset = 0;
while (true) {
if (begin == end) return;
// If we're looking for the smallest item in the array, find it and move it to the beginning.
if (n == 0) {
Iter min_iter = std::min_element(begin, end);
std::iter_swap(min_iter, begin);
return;
}
// If we're looking for the largest item in the array, find it and move it to the last item's position.
else if (n == static_cast<std::size_t>(end - begin - 1)) {
Iter max_iter = std::max_element(begin, end);
std::iter_swap(max_iter, end - 1);
return;
}
Iter pivot = selectPivotAndPartitionArray<BlockSize, items_per_cache_line>(begin, end, n, offset, comp);
if (static_cast<std::size_t>(pivot - begin) == n) return;
bool n_is_smaller = n < static_cast<std::size_t>(pivot - begin);
// If the nth_element is smaller than the pivot's position, set the new search range to [begin, pivot).
end -= ((end - pivot) & (0 - (n_is_smaller)));
// If the nth_element is larger than the pivot's position, set the new search range to (pivot, end) and adjust the offset.
n -= ((pivot - begin + 1) & (0 - (!n_is_smaller)));
offset += ((pivot - begin + 1) & (0 - (!n_is_smaller)));
offset &= (items_per_cache_line - 1);
begin += ((pivot - begin + 1) & (0 - (!n_is_smaller)));
}
}