//! Stable in-place block merge sort (a port of WikiSort).
1const builtin = @import("builtin");
2const std = @import("../std.zig");
3const sort = std.sort;
4const math = std.math;
5const mem = std.mem;
6
/// A half-open interval of indices `[start, end)` into the items being sorted.
const Range = struct {
    start: usize,
    end: usize,

    /// Build a range covering `[start, end)`.
    fn init(start: usize, end: usize) Range {
        return .{ .start = start, .end = end };
    }

    /// Number of elements the range covers.
    fn length(self: Range) usize {
        return self.end - self.start;
    }
};
22
/// Yields the successive subarray ranges for one level of the bottom-up merge
/// sort, then doubles the range size for the next level via `nextLevel`.
/// Positions are tracked in integer fixed-point form (`decimal` is the whole
/// part, `numerator/denominator` the fractional part) so the ranges exactly
/// tile `size` without floating point, even when `size` is not a power of two.
const Iterator = struct {
    /// Total number of items being iterated over.
    size: usize,
    /// Largest power of two <= size.
    power_of_two: usize,
    /// Fractional part (numerator) of the current position.
    numerator: usize,
    /// Whole part of the current position — an index into the items.
    decimal: usize,
    /// Shared denominator for the fractional parts: power_of_two / min_level.
    numerator_step: usize,
    denominator: usize,
    /// Whole part of the per-range step (the base range length at this level).
    decimal_step: usize,

    fn init(size2: usize, min_level: usize) Iterator {
        const power_of_two = math.floorPowerOfTwo(usize, size2);
        const denominator = power_of_two / min_level;
        return Iterator{
            .numerator = 0,
            .decimal = 0,
            .size = size2,
            .power_of_two = power_of_two,
            .denominator = denominator,
            // size2 / denominator and size2 % denominator split the exact
            // rational step size2/denominator into whole + fractional parts.
            .decimal_step = size2 / denominator,
            .numerator_step = size2 % denominator,
        };
    }

    /// Rewind to the start of the current level.
    fn begin(self: *Iterator) void {
        self.numerator = 0;
        self.decimal = 0;
    }

    /// Return the next range at this level and advance the position by one step.
    fn nextRange(self: *Iterator) Range {
        const start = self.decimal;

        self.decimal += self.decimal_step;
        self.numerator += self.numerator_step;
        // Carry the fractional overflow into the whole part.
        if (self.numerator >= self.denominator) {
            self.numerator -= self.denominator;
            self.decimal += 1;
        }

        return Range{
            .start = start,
            .end = self.decimal,
        };
    }

    /// True once the whole array has been tiled at this level.
    fn finished(self: *Iterator) bool {
        return self.decimal >= self.size;
    }

    /// Double the step (range) size. Returns false when a single range would
    /// cover the entire array, i.e. there are no more levels to merge.
    fn nextLevel(self: *Iterator) bool {
        self.decimal_step += self.decimal_step;
        self.numerator_step += self.numerator_step;
        if (self.numerator_step >= self.denominator) {
            self.numerator_step -= self.denominator;
            self.decimal_step += 1;
        }

        return (self.decimal_step < self.size);
    }

    /// Base (whole-part) length of ranges at the current level; actual ranges
    /// may be one longer due to the fractional carry.
    fn length(self: *Iterator) usize {
        return self.decimal_step;
    }
};
86
/// Bookkeeping for one internal buffer "pulled out" of the array:
/// `count` unique values gathered starting at index `from` and moved toward
/// index `to`, taken from within `range`; used later to undo the pull by
/// redistributing the values back into place.
const Pull = struct {
    from: usize,
    to: usize,
    count: usize,
    range: Range,
};
93
/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case.
/// O(1) memory (no allocator required).
/// Sorts in ascending order with respect to the given `lessThan` function.
///
/// NOTE: The algorithm only works when the comparison is less-than or greater-than.
/// (See https://github.com/ziglang/zig/issues/8289)
pub fn block(
    comptime T: type,
    items: []T,
    context: anytype,
    comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    // In Debug mode, wrap the comparison so that an invalid ordering
    // (lessThan(a, b) and lessThan(b, a) both true) trips an assertion
    // instead of silently producing an incorrectly sorted result.
    const lessThan = if (builtin.mode == .Debug) struct {
        fn lessThan(ctx: @TypeOf(context), lhs: T, rhs: T) bool {
            const lt = lessThanFn(ctx, lhs, rhs);
            const gt = lessThanFn(ctx, rhs, lhs);
            std.debug.assert(!(lt and gt));
            return lt;
        }
    }.lessThan else lessThanFn;

    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
    // NOTE(review): fixed stack scratch buffer of 512 elements of T
    // (512 * @sizeOf(T) bytes of stack) — confirm acceptable for large T.
    var cache: [512]T = undefined;

    if (items.len < 4) {
        if (items.len == 3) {
            // hard coded insertion sort
            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
            if (lessThan(context, items[2], items[1])) {
                mem.swap(T, &items[1], &items[2]);
                if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
            }
        } else if (items.len == 2) {
            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
        }
        return;
    }

    // sort groups of 4-8 items at a time using an unstable sorting network,
    // but keep track of the original item orders to force it to be stable
    // http://pages.ripco.net/~jgamble/nw.html
    var iterator = Iterator.init(items.len, 4);
    while (!iterator.finished()) {
        // `order` records the original index of each item so that equal items
        // are tie-broken by original position, making the network stable.
        var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
        const range = iterator.nextRange();

        const sliced_items = items[range.start..];
        switch (range.length()) {
            8 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 6, 7, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 4, 6, context, lessThan);
                swap(T, sliced_items, &order, 5, 7, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 5, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 7, context, lessThan);
                swap(T, sliced_items, &order, 1, 5, context, lessThan);
                swap(T, sliced_items, &order, 2, 6, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 6, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
            },
            7 => {
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 5, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 4, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 2, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 4, context, lessThan);
                swap(T, sliced_items, &order, 1, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 5, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
            },
            6 => {
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
            },
            5 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
            },
            4 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
            },
            else => {},
        }
    }
    if (items.len < 8) return;

    // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc.
    while (true) {
        // if every A and B block will fit into the cache, use a special branch
        // specifically for merging with the cache
        // (we use < rather than <= since the block size might be one more than
        // iterator.length())
        if (iterator.length() < cache.len) {
            // if four subarrays fit into the cache, it's faster to merge both
            // pairs of subarrays into the cache,
            // then merge the two merged subarrays from the cache back into the original array
            if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) {
                iterator.begin();
                while (!iterator.finished()) {
                    // merge A1 and B1 into the cache
                    var A1 = iterator.nextRange();
                    var B1 = iterator.nextRange();
                    var A2 = iterator.nextRange();
                    var B2 = iterator.nextRange();

                    if (lessThan(context, items[B1.end - 1], items[A1.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the cache
                        const a1_items = items[A1.start..A1.end];
                        @memcpy(cache[B1.length()..][0..a1_items.len], a1_items);
                        const b1_items = items[B1.start..B1.end];
                        @memcpy(cache[0..b1_items.len], b1_items);
                    } else if (lessThan(context, items[B1.start], items[A1.end - 1])) {
                        // these two ranges weren't already in order, so merge them into the cache
                        mergeInto(T, items, A1, B1, cache[0..], context, lessThan);
                    } else {
                        // if A1, B1, A2, and B2 are all in order, skip doing anything else
                        if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue;

                        // copy A1 and B1 into the cache in the same order
                        const a1_items = items[A1.start..A1.end];
                        @memcpy(cache[0..a1_items.len], a1_items);
                        const b1_items = items[B1.start..B1.end];
                        @memcpy(cache[A1.length()..][0..b1_items.len], b1_items);
                    }
                    A1 = Range.init(A1.start, B1.end);

                    // merge A2 and B2 into the cache
                    if (lessThan(context, items[B2.end - 1], items[A2.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the cache
                        const a2_items = items[A2.start..A2.end];
                        @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items);
                        const b2_items = items[B2.start..B2.end];
                        @memcpy(cache[A1.length()..][0..b2_items.len], b2_items);
                    } else if (lessThan(context, items[B2.start], items[A2.end - 1])) {
                        // these two ranges weren't already in order, so merge them into the cache
                        mergeInto(T, items, A2, B2, cache[A1.length()..], context, lessThan);
                    } else {
                        // copy A2 and B2 into the cache in the same order
                        const a2_items = items[A2.start..A2.end];
                        @memcpy(cache[A1.length()..][0..a2_items.len], a2_items);
                        const b2_items = items[B2.start..B2.end];
                        @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items);
                    }
                    A2 = Range.init(A2.start, B2.end);

                    // merge A1 and A2 from the cache into the items
                    const A3 = Range.init(0, A1.length());
                    const B3 = Range.init(A1.length(), A1.length() + A2.length());

                    if (lessThan(context, cache[B3.end - 1], cache[A3.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the items
                        const a3_items = cache[A3.start..A3.end];
                        @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items);
                        const b3_items = cache[B3.start..B3.end];
                        @memcpy(items[A1.start..][0..b3_items.len], b3_items);
                    } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) {
                        // these two ranges weren't already in order, so merge them back into the items
                        mergeInto(T, cache[0..], A3, B3, items[A1.start..], context, lessThan);
                    } else {
                        // copy A3 and B3 into the items in the same order
                        const a3_items = cache[A3.start..A3.end];
                        @memcpy(items[A1.start..][0..a3_items.len], a3_items);
                        const b3_items = cache[B3.start..B3.end];
                        @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items);
                    }
                }

                // we merged two levels at the same time, so we're done with this level already
                // (iterator.nextLevel() is called again at the bottom of this outer merge loop)
                _ = iterator.nextLevel();
            } else {
                iterator.begin();
                while (!iterator.finished()) {
                    const A = iterator.nextRange();
                    const B = iterator.nextRange();

                    if (lessThan(context, items[B.end - 1], items[A.start])) {
                        // the two ranges are in reverse order, so a simple rotation should fix it
                        mem.rotate(T, items[A.start..B.end], A.length());
                    } else if (lessThan(context, items[B.start], items[A.end - 1])) {
                        // these two ranges weren't already in order, so we'll need to merge them!
                        const a_items = items[A.start..A.end];
                        @memcpy(cache[0..a_items.len], a_items);
                        mergeExternal(T, items, A, B, cache[0..], context, lessThan);
                    }
                }
            }
        } else {
            // this is where the in-place merge logic starts!
            // 1. pull out two internal buffers each containing √A unique values
            // 1a. adjust block_size and buffer_size if we couldn't find enough unique values
            // 2. loop over the A and B subarrays within this level of the merge sort
            // 3. break A and B into blocks of size 'block_size'
            // 4. "tag" each of the A blocks with values from the first internal buffer
            // 5. roll the A blocks through the B blocks and drop/rotate them where they belong
            // 6. merge each A block with any B values that follow, using the cache or the second internal buffer
            // 7. sort the second internal buffer if it exists
            // 8. redistribute the two internal buffers back into the items
            var block_size: usize = math.sqrt(iterator.length());
            var buffer_size = iterator.length() / block_size + 1;

            // as an optimization, we really only need to pull out the internal buffers once for each level of merges
            // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level
            var A: Range = undefined;
            var B: Range = undefined;
            var index: usize = 0;
            var last: usize = 0;
            var count: usize = 0;
            var find: usize = 0;
            var start: usize = 0;
            var pull_index: usize = 0;
            var pull = [_]Pull{
                Pull{
                    .from = 0,
                    .to = 0,
                    .count = 0,
                    .range = Range.init(0, 0),
                },
                Pull{
                    .from = 0,
                    .to = 0,
                    .count = 0,
                    .range = Range.init(0, 0),
                },
            };

            var buffer1 = Range.init(0, 0);
            var buffer2 = Range.init(0, 0);

            // find two internal buffers of size 'buffer_size' each
            find = buffer_size + buffer_size;
            var find_separately = false;

            if (block_size <= cache.len) {
                // if every A block fits into the cache then we won't need the second internal buffer,
                // so we really only need to find 'buffer_size' unique values
                find = buffer_size;
            } else if (find > iterator.length()) {
                // we can't fit both buffers into the same A or B subarray, so find two buffers separately
                find = buffer_size;
                find_separately = true;
            }

            // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each),
            // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
            // OR if we couldn't find that many unique values, we need the largest possible buffer we can get

            // in the case where it couldn't find a single buffer of at least √A unique values,
            // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
            iterator.begin();
            while (!iterator.finished()) {
                A = iterator.nextRange();
                B = iterator.nextRange();

                // just store information about where the values will be pulled from and to,
                // as well as how many values there are, to create the two internal buffers

                // check A for the number of unique values we need to fill an internal buffer
                // these values will be pulled out to the start of A
                last = A.start;
                count = 1;
                while (count < find) : ({
                    last = index;
                    count += 1;
                }) {
                    index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), find - count, context, lessThan);
                    if (index == A.end) break;
                }
                index = last;

                if (count >= buffer_size) {
                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = A.start,
                    };
                    // any further buffer found at this level is recorded in pull[1]
                    pull_index = 1;

                    if (count == buffer_size + buffer_size) {
                        // we were able to find a single contiguous section containing 2√A unique values,
                        // so this section can be used to contain both of the internal buffers we'll need
                        buffer1 = Range.init(A.start, A.start + buffer_size);
                        buffer2 = Range.init(A.start + buffer_size, A.start + count);
                        break;
                    } else if (find == buffer_size + buffer_size) {
                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
                        // so we still need to find a second separate buffer of at least √A unique values
                        buffer1 = Range.init(A.start, A.start + count);
                        find = buffer_size;
                    } else if (block_size <= cache.len) {
                        // we found the first and only internal buffer that we need, so we're done!
                        buffer1 = Range.init(A.start, A.start + count);
                        break;
                    } else if (find_separately) {
                        // found one buffer, but now find the other one
                        buffer1 = Range.init(A.start, A.start + count);
                        find_separately = false;
                    } else {
                        // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
                        buffer2 = Range.init(A.start, A.start + count);
                        break;
                    }
                } else if (pull_index == 0 and count > buffer1.length()) {
                    // keep track of the largest buffer we were able to find
                    buffer1 = Range.init(A.start, A.start + count);
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = A.start,
                    };
                }

                // check B for the number of unique values we need to fill an internal buffer
                // these values will be pulled out to the end of B
                last = B.end - 1;
                count = 1;
                while (count < find) : ({
                    last = index - 1;
                    count += 1;
                }) {
                    index = findFirstBackward(T, items, items[last], Range.init(B.start, last), find - count, context, lessThan);
                    if (index == B.start) break;
                }
                index = last;

                if (count >= buffer_size) {
                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = B.end,
                    };
                    // any further buffer found at this level is recorded in pull[1]
                    pull_index = 1;

                    if (count == buffer_size + buffer_size) {
                        // we were able to find a single contiguous section containing 2√A unique values,
                        // so this section can be used to contain both of the internal buffers we'll need
                        buffer1 = Range.init(B.end - count, B.end - buffer_size);
                        buffer2 = Range.init(B.end - buffer_size, B.end);
                        break;
                    } else if (find == buffer_size + buffer_size) {
                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
                        // so we still need to find a second separate buffer of at least √A unique values
                        buffer1 = Range.init(B.end - count, B.end);
                        find = buffer_size;
                    } else if (block_size <= cache.len) {
                        // we found the first and only internal buffer that we need, so we're done!
                        buffer1 = Range.init(B.end - count, B.end);
                        break;
                    } else if (find_separately) {
                        // found one buffer, but now find the other one
                        buffer1 = Range.init(B.end - count, B.end);
                        find_separately = false;
                    } else {
                        // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
                        // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
                        if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;

                        // we found a second buffer in an 'B' subarray containing √A unique values, so we're done!
                        buffer2 = Range.init(B.end - count, B.end);
                        break;
                    }
                } else if (pull_index == 0 and count > buffer1.length()) {
                    // keep track of the largest buffer we were able to find
                    buffer1 = Range.init(B.end - count, B.end);
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = B.end,
                    };
                }
            }

            // pull out the two ranges so we can use them as internal buffers
            pull_index = 0;
            while (pull_index < 2) : (pull_index += 1) {
                const length = pull[pull_index].count;

                if (pull[pull_index].to < pull[pull_index].from) {
                    // we're pulling the values out to the left, which means the start of an A subarray
                    index = pull[pull_index].from;
                    count = 1;
                    while (count < length) : (count += 1) {
                        index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, context, lessThan);
                        const range = Range.init(index + 1, pull[pull_index].from + 1);
                        mem.rotate(T, items[range.start..range.end], range.length() - count);
                        pull[pull_index].from = index + count;
                    }
                } else if (pull[pull_index].to > pull[pull_index].from) {
                    // we're pulling values out to the right, which means the end of a B subarray
                    index = pull[pull_index].from + 1;
                    count = 1;
                    while (count < length) : (count += 1) {
                        index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), length - count, context, lessThan);
                        const range = Range.init(pull[pull_index].from, index - 1);
                        mem.rotate(T, items[range.start..range.end], count);
                        pull[pull_index].from = index - 1 - count;
                    }
                }
            }

            // adjust block_size and buffer_size based on the values we were able to pull out
            buffer_size = buffer1.length();
            block_size = iterator.length() / buffer_size + 1;

            // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
            // so this was originally here to test the math for adjusting block_size above
            // assert((iterator.length() + 1)/block_size <= buffer_size);

            // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
            iterator.begin();
            while (!iterator.finished()) {
                A = iterator.nextRange();
                B = iterator.nextRange();

                // remove any parts of A or B that are being used by the internal buffers
                start = A.start;
                if (start == pull[0].range.start) {
                    if (pull[0].from > pull[0].to) {
                        A.start += pull[0].count;

                        // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
                        // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4,
                        // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal
                        if (A.length() == 0) continue;
                    } else if (pull[0].from < pull[0].to) {
                        B.end -= pull[0].count;
                        if (B.length() == 0) continue;
                    }
                }
                if (start == pull[1].range.start) {
                    if (pull[1].from > pull[1].to) {
                        A.start += pull[1].count;
                        if (A.length() == 0) continue;
                    } else if (pull[1].from < pull[1].to) {
                        B.end -= pull[1].count;
                        if (B.length() == 0) continue;
                    }
                }

                if (lessThan(context, items[B.end - 1], items[A.start])) {
                    // the two ranges are in reverse order, so a simple rotation should fix it
                    mem.rotate(T, items[A.start..B.end], A.length());
                } else if (lessThan(context, items[A.end], items[A.end - 1])) {
                    // these two ranges weren't already in order, so we'll need to merge them!
                    // (note: items[A.end] is the first item of B, since B starts where A ends)
                    var findA: usize = undefined;

                    // break the remainder of A into blocks. firstA is the uneven-sized first A block
                    var blockA = Range.init(A.start, A.end);
                    var firstA = Range.init(A.start, A.start + blockA.length() % block_size);

                    // swap the first value of each A block with the value in buffer1
                    var indexA = buffer1.start;
                    index = firstA.end;
                    while (index < blockA.end) : ({
                        indexA += 1;
                        index += block_size;
                    }) {
                        mem.swap(T, &items[indexA], &items[index]);
                    }

                    // start rolling the A blocks through the B blocks!
                    // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
                    var lastA = firstA;
                    var lastB = Range.init(0, 0);
                    var blockB = Range.init(B.start, B.start + @min(block_size, B.length()));
                    blockA.start += firstA.length();
                    indexA = buffer1.start;

                    // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it
                    // otherwise, if the second buffer is available, block swap the contents into that
                    if (lastA.length() <= cache.len) {
                        const last_a_items = items[lastA.start..lastA.end];
                        @memcpy(cache[0..last_a_items.len], last_a_items);
                    } else if (buffer2.length() > 0) {
                        blockSwap(T, items, lastA.start, buffer2.start, lastA.length());
                    }

                    if (blockA.length() > 0) {
                        while (true) {
                            // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
                            // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
                            if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) {
                                // figure out where to split the previous B block, and rotate it at the split
                                const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan);
                                const B_remaining = lastB.end - B_split;

                                // swap the minimum A block to the beginning of the rolling A blocks
                                var minA = blockA.start;
                                findA = minA + block_size;
                                while (findA < blockA.end) : (findA += block_size) {
                                    if (lessThan(context, items[findA], items[minA])) {
                                        minA = findA;
                                    }
                                }
                                blockSwap(T, items, blockA.start, minA, block_size);

                                // swap the first item of the previous A block back with its original value, which is stored in buffer1
                                mem.swap(T, &items[blockA.start], &items[indexA]);
                                indexA += 1;

                                // locally merge the previous A block with the B values that follow it
                                // if lastA fits into the external cache we'll use that (with MergeExternal),
                                // or if the second internal buffer exists we'll use that (with MergeInternal),
                                // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)

                                if (lastA.length() <= cache.len) {
                                    mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), cache[0..], context, lessThan);
                                } else if (buffer2.length() > 0) {
                                    mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), buffer2, context, lessThan);
                                } else {
                                    mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan);
                                }

                                if (buffer2.length() > 0 or block_size <= cache.len) {
                                    // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway
                                    if (block_size <= cache.len) {
                                        @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]);
                                    } else {
                                        blockSwap(T, items, blockA.start, buffer2.start, block_size);
                                    }

                                    // this is equivalent to rotating, but faster
                                    // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
                                    // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
                                    blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining);
                                } else {
                                    // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
                                    mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split);
                                }

                                // update the range for the remaining A blocks, and the range remaining from the B block after it was split
                                lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
                                lastB = Range.init(lastA.end, lastA.end + B_remaining);

                                // if there are no more A blocks remaining, this step is finished!
                                blockA.start += block_size;
                                if (blockA.length() == 0) break;
                            } else if (blockB.length() < block_size) {
                                // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
                                // the cache is disabled here since it might contain the contents of the previous A block
                                mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start);

                                lastB = Range.init(blockA.start, blockA.start + blockB.length());
                                blockA.start += blockB.length();
                                blockA.end += blockB.length();
                                blockB.end = blockB.start;
                            } else {
                                // roll the leftmost A block to the end by swapping it with the next B block
                                blockSwap(T, items, blockA.start, blockB.start, block_size);
                                lastB = Range.init(blockA.start, blockA.start + block_size);

                                blockA.start += block_size;
                                blockA.end += block_size;
                                blockB.start += block_size;

                                if (blockB.end > B.end - block_size) {
                                    blockB.end = B.end;
                                } else {
                                    blockB.end += block_size;
                                }
                            }
                        }
                    }

                    // merge the last A block with the remaining B values
                    if (lastA.length() <= cache.len) {
                        mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), cache[0..], context, lessThan);
                    } else if (buffer2.length() > 0) {
                        mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), buffer2, context, lessThan);
                    } else {
                        mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan);
                    }
                }
            }

            // when we're finished with this merge step we should have the one
            // or two internal buffers left over, where the second buffer is all jumbled up
            // insertion sort the second buffer, then redistribute the buffers
            // back into the items using the opposite process used for creating the buffer

            // while an unstable sort like quicksort could be applied here, in benchmarks
            // it was consistently slightly slower than a simple insertion sort,
            // even for tens of millions of items. this may be because insertion
            // sort is quite fast when the data is already somewhat sorted, like it is here
            sort.insertion(T, items[buffer2.start..buffer2.end], context, lessThan);

            pull_index = 0;
            while (pull_index < 2) : (pull_index += 1) {
                var unique = pull[pull_index].count * 2;
                if (pull[pull_index].from > pull[pull_index].to) {
                    // the values were pulled out to the left, so redistribute them back to the right
                    var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
                    while (buffer.length() > 0) {
                        index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), unique, context, lessThan);
                        const amount = index - buffer.end;
                        mem.rotate(T, items[buffer.start..index], buffer.length());
                        buffer.start += (amount + 1);
                        buffer.end += amount;
                        unique -= 2;
                    }
                } else if (pull[pull_index].from < pull[pull_index].to) {
                    // the values were pulled out to the right, so redistribute them back to the left
                    var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
                    while (buffer.length() > 0) {
                        index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), unique, context, lessThan);
                        const amount = buffer.start - index;
                        mem.rotate(T, items[index..buffer.end], amount);
                        buffer.start -= amount;
                        buffer.end -= (amount + 1);
                        unique -= 2;
                    }
                }
            }
        }

        // double the size of each A and B subarray that will be merged in the next level
        if (!iterator.nextLevel()) break;
    }
}
// merge operation without a buffer
// merges sorted ranges A and B (A immediately precedes B) in place using
// binary search + rotation; both ranges must be sorted by lessThan
fn mergeInPlace(
    comptime T: type,
    items: []T,
    A_arg: Range,
    B_arg: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    if (A_arg.length() == 0 or B_arg.length() == 0) return;

    // this just repeatedly binary searches into B and rotates A into position.
    // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here,
    // but I decided to stick with this because it had better situational performance
    //
    // (Hwang and Lin is designed for merging subarrays of very different sizes,
    // but WikiSort almost always uses subarrays that are roughly the same size)
    //
    // normally this is incredibly suboptimal, but this function is only called
    // when none of the A or B blocks in any subarray contained 2√A unique values,
    // which places a hard limit on the number of times this will ACTUALLY need
    // to binary search and rotate.
    //
    // according to my analysis the worst case is √A rotations performed on √A items
    // once the constant factors are removed, which ends up being O(n)
    //
    // again, this is NOT a general-purpose solution – it only works well in this case!
    // kind of like how the O(n^2) insertion sort is used in some places

    var A = A_arg;
    var B = B_arg;

    while (true) {
        // find the first place in B where the first item in A needs to be inserted
        const mid = binaryFirst(T, items, items[A.start], B, context, lessThan);

        // rotate A into place
        // amount = how many B items the rotation moves in front of A
        const amount = mid - A.end;
        mem.rotate(T, items[A.start..mid], A.length());
        // mid == B.end means A was rotated to the very end: merge complete
        if (B.end == mid) break;

        // calculate the new A and B ranges
        B.start = mid;
        A = Range.init(A.start + amount, B.start);
        // advance A.start past the run of items equal to A's head (upper bound);
        // those items are already in their final position relative to B
        A.start = binaryLast(T, items, items[A.start], A, context, lessThan);
        if (A.length() == 0) break;
    }
}
805
// merge operation using an internal buffer
// merges items[A] (whose contents currently live in items[buffer]) with items[B]
fn mergeInternal(
    comptime T: type,
    items: []T,
    A: Range,
    B: Range,
    buffer: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    // every time a value is placed into its final slot, the value previously
    // occupying that slot is swapped into the buffer; once finished, 'buffer'
    // still holds its original contents, merely permuted
    var a_taken: usize = 0;
    var b_taken: usize = 0;
    var placed: usize = 0;

    if (A.length() > 0 and B.length() > 0) {
        while (true) {
            if (lessThan(context, items[B.start + b_taken], items[buffer.start + a_taken])) {
                // next output comes from B
                mem.swap(T, &items[A.start + placed], &items[B.start + b_taken]);
                b_taken += 1;
                placed += 1;
                if (b_taken >= B.length()) break;
            } else {
                // next output comes from A's buffered copy (ties favor A: stable)
                mem.swap(T, &items[A.start + placed], &items[buffer.start + a_taken]);
                a_taken += 1;
                placed += 1;
                if (a_taken >= A.length()) break;
            }
        }
    }

    // swap whatever remains of A into its final position
    blockSwap(T, items, buffer.start + a_taken, A.start + placed, A.length() - a_taken);
}
841
// exchanges two equal-length, non-overlapping regions of `items` element by element
fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void {
    for (0..block_size) |offset| {
        mem.swap(T, &items[start1 + offset], &items[start2 + offset]);
    }
}
848
// combine a linear search with a binary search to reduce the number of comparisons in situations
// where we have some idea as to how many unique values there are and where the next value might be
fn findFirstForward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    // probe forward in strides of range.length()/unique (at least 1),
    // then finish with a binary search confined to the final stride
    const stride = @max(range.length() / unique, @as(usize, 1));

    var probe = range.start + stride;
    while (lessThan(context, items[probe - 1], value)) {
        if (probe >= range.end - stride) {
            return binaryFirst(T, items, value, Range.init(probe, range.end), context, lessThan);
        }
        probe += stride;
    }

    return binaryFirst(T, items, value, Range.init(probe - stride, probe), context, lessThan);
}
872
// backward-walking counterpart of findFirstForward: linear strides from the end
// of `range`, then a binary search inside the final stride
fn findFirstBackward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const stride = @max(range.length() / unique, @as(usize, 1));

    var probe = range.end - stride;
    while (probe > range.start and !lessThan(context, items[probe - 1], value)) {
        if (probe < range.start + stride) {
            return binaryFirst(T, items, value, Range.init(range.start, probe), context, lessThan);
        }
        probe -= stride;
    }

    return binaryFirst(T, items, value, Range.init(probe, probe + stride), context, lessThan);
}
894
// like findFirstForward, but locates the position just past the last occurrence
// of `value` (upper bound) via binaryLast
fn findLastForward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const stride = @max(range.length() / unique, @as(usize, 1));

    var probe = range.start + stride;
    while (!lessThan(context, value, items[probe - 1])) {
        if (probe >= range.end - stride) {
            return binaryLast(T, items, value, Range.init(probe, range.end), context, lessThan);
        }
        probe += stride;
    }

    return binaryLast(T, items, value, Range.init(probe - stride, probe), context, lessThan);
}
916
// backward-walking counterpart of findLastForward: strides from the end of
// `range` toward its start, then a binary search inside the final stride
fn findLastBackward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const stride = @max(range.length() / unique, @as(usize, 1));

    var probe = range.end - stride;
    while (probe > range.start and lessThan(context, value, items[probe - 1])) {
        if (probe < range.start + stride) {
            return binaryLast(T, items, value, Range.init(range.start, probe), context, lessThan);
        }
        probe -= stride;
    }

    return binaryLast(T, items, value, Range.init(probe, probe + stride), context, lessThan);
}
938
// branch-reduced binary search: returns the index of the first element in
// `range` that is not less than `value` (lower bound), or range.end if
// every element is smaller (or the range is empty)
fn binaryFirst(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.start >= range.end) return range.end;
    var position = range.start;
    var span = range.length();
    while (span != 0) {
        // when span is odd, the extra slot must be carried into the step
        const carry = span & 1;
        span >>= 1;
        if (lessThan(context, items[position + span], value)) {
            position += span + carry;
        }
    }
    return position;
}
961
// branch-reduced binary search: returns the index just past the last element
// in `range` that is not greater than `value` (upper bound), or range.end if
// every element is <= value (or the range is empty)
fn binaryLast(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.start >= range.end) return range.end;
    var position = range.start;
    var span = range.length();
    while (span != 0) {
        // when span is odd, the extra slot must be carried into the step
        const carry = span & 1;
        span >>= 1;
        if (!lessThan(context, value, items[position + span])) {
            position += span + carry;
        }
    }
    return position;
}
984
// standard two-way merge of from[A] and from[B] into the start of `into`;
// assumes both ranges are non-empty and `into` has room for both
fn mergeInto(
    comptime T: type,
    from: []T,
    A: Range,
    B: Range,
    into: []T,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    var a_cursor = A.start;
    var b_cursor = B.start;
    var out: usize = 0;

    while (true) {
        if (lessThan(context, from[b_cursor], from[a_cursor])) {
            // B's element strictly precedes A's
            into[out] = from[b_cursor];
            b_cursor += 1;
            out += 1;
            if (b_cursor == B.end) {
                // B exhausted: bulk-copy the rest of A
                const tail = from[a_cursor..A.end];
                @memcpy(into[out..][0..tail.len], tail);
                return;
            }
        } else {
            // ties favor A to keep the merge stable
            into[out] = from[a_cursor];
            a_cursor += 1;
            out += 1;
            if (a_cursor == A.end) {
                // A exhausted: bulk-copy the rest of B
                const tail = from[b_cursor..B.end];
                @memcpy(into[out..][0..tail.len], tail);
                return;
            }
        }
    }
}
1024
// merge operation using the external cache: A's contents have already been
// copied into `cache`, so merge cache[0..A.length()] with items[B] directly
// back into items starting at A.start
fn mergeExternal(
    comptime T: type,
    items: []T,
    A: Range,
    B: Range,
    cache: []T,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    var a_cursor: usize = 0;
    var b_cursor = B.start;
    var out = A.start;
    const a_end = A.length();

    if (A.length() > 0 and B.length() > 0) {
        while (true) {
            if (lessThan(context, items[b_cursor], cache[a_cursor])) {
                // B's element strictly precedes A's
                items[out] = items[b_cursor];
                b_cursor += 1;
                out += 1;
                if (b_cursor == B.end) break;
            } else {
                // ties favor A to keep the merge stable
                items[out] = cache[a_cursor];
                a_cursor += 1;
                out += 1;
                if (a_cursor == a_end) break;
            }
        }
    }

    // flush whatever remains of A out of the cache
    const leftover = cache[a_cursor..a_end];
    @memcpy(items[out..][0..leftover.len], leftover);
}
1061
// stable compare-and-swap step for the fixed 8-element sorting network:
// swaps when items[y] < items[x], or when the two compare equal but y's
// original position (tracked in `order`) precedes x's
fn swap(
    comptime T: type,
    items: []T,
    order: *[8]u8,
    x: usize,
    y: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    const out_of_order = lessThan(context, items[y], items[x]) or
        (order[x] > order[y] and !lessThan(context, items[x], items[y]));
    if (out_of_order) {
        mem.swap(T, &items[x], &items[y]);
        mem.swap(u8, &order[x], &order[y]);
    }
}