const builtin = @import("builtin");
const std = @import("../std.zig");
const sort = std.sort;
const math = std.math;
const mem = std.mem;

const Range = struct {
    start: usize,
    end: usize,

    fn init(start: usize, end: usize) Range {
        return Range{
            .start = start,
            .end = end,
        };
    }

    fn length(self: Range) usize {
        return self.end - self.start;
    }
};

const Iterator = struct {
    size: usize,
    power_of_two: usize,
    numerator: usize,
    decimal: usize,
    denominator: usize,
    decimal_step: usize,
    numerator_step: usize,

    fn init(size2: usize, min_level: usize) Iterator {
        const power_of_two = math.floorPowerOfTwo(usize, size2);
        const denominator = power_of_two / min_level;
        return Iterator{
            .numerator = 0,
            .decimal = 0,
            .size = size2,
            .power_of_two = power_of_two,
            .denominator = denominator,
            .decimal_step = size2 / denominator,
            .numerator_step = size2 % denominator,
        };
    }

    fn begin(self: *Iterator) void {
        self.numerator = 0;
        self.decimal = 0;
    }

    fn nextRange(self: *Iterator) Range {
        const start = self.decimal;

        self.decimal += self.decimal_step;
        self.numerator += self.numerator_step;
        if (self.numerator >= self.denominator) {
            self.numerator -= self.denominator;
            self.decimal += 1;
        }

        return Range{
            .start = start,
            .end = self.decimal,
        };
    }

    fn finished(self: *Iterator) bool {
        return self.decimal >= self.size;
    }

    fn nextLevel(self: *Iterator) bool {
        self.decimal_step += self.decimal_step;
        self.numerator_step += self.numerator_step;
        if (self.numerator_step >= self.denominator) {
            self.numerator_step -= self.denominator;
            self.decimal_step += 1;
        }

        return (self.decimal_step < self.size);
    }

    fn length(self: *Iterator) usize {
        return self.decimal_step;
    }
};
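
// Illustrative test (a sketch, not exhaustive): the fractional stepping above is
// meant to split `size` items into `denominator` nearly equal ranges per level.
// With a power-of-two length the split is exact, which is easy to check.
test "Iterator splits the items into evenly sized ranges" {
    var it = Iterator.init(16, 4);
    var count: usize = 0;
    while (!it.finished()) : (count += 1) {
        const range = it.nextRange();
        try std.testing.expectEqual(@as(usize, 4), range.length());
    }
    try std.testing.expectEqual(@as(usize, 4), count);
}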

const Pull = struct {
    from: usize,
    to: usize,
    count: usize,
    range: Range,
};

/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case.
/// O(1) memory (no allocator required).
/// Sorts in ascending order with respect to the given `lessThan` function.
///
/// NOTE: The algorithm only works when the comparison is less-than or greater-than.
///       (See https://github.com/ziglang/zig/issues/8289)
pub fn block(
    comptime T: type,
    items: []T,
    context: anytype,
    comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    const lessThan = if (builtin.mode == .Debug) struct {
        fn lessThan(ctx: @TypeOf(context), lhs: T, rhs: T) bool {
            const lt = lessThanFn(ctx, lhs, rhs);
            const gt = lessThanFn(ctx, rhs, lhs);
            std.debug.assert(!(lt and gt));
            return lt;
        }
    }.lessThan else lessThanFn;

    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
    var cache: [512]T = undefined;

    if (items.len < 4) {
        if (items.len == 3) {
            // hard coded insertion sort
            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
            if (lessThan(context, items[2], items[1])) {
                mem.swap(T, &items[1], &items[2]);
                if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
            }
        } else if (items.len == 2) {
            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
        }
        return;
    }

    // sort groups of 4-8 items at a time using an unstable sorting network,
    // but keep track of the original item orders to force it to be stable
    // http://pages.ripco.net/~jgamble/nw.html
    var iterator = Iterator.init(items.len, 4);
    while (!iterator.finished()) {
        var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
        const range = iterator.nextRange();

        const sliced_items = items[range.start..];
        switch (range.length()) {
            8 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 6, 7, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 4, 6, context, lessThan);
                swap(T, sliced_items, &order, 5, 7, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 5, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 7, context, lessThan);
                swap(T, sliced_items, &order, 1, 5, context, lessThan);
                swap(T, sliced_items, &order, 2, 6, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 6, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
            },
            7 => {
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 5, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 4, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 2, 6, context, lessThan);
                swap(T, sliced_items, &order, 0, 4, context, lessThan);
                swap(T, sliced_items, &order, 1, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 5, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
            },
            6 => {
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
                swap(T, sliced_items, &order, 4, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 3, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 5, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
            },
            5 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 3, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 4, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 4, context, lessThan);
                swap(T, sliced_items, &order, 0, 3, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
            },
            4 => {
                swap(T, sliced_items, &order, 0, 1, context, lessThan);
                swap(T, sliced_items, &order, 2, 3, context, lessThan);
                swap(T, sliced_items, &order, 0, 2, context, lessThan);
                swap(T, sliced_items, &order, 1, 3, context, lessThan);
                swap(T, sliced_items, &order, 1, 2, context, lessThan);
            },
            else => {},
        }
    }
    if (items.len < 8) return;

    // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc.
    while (true) {
        // if every A and B block will fit into the cache, use a special branch
        // specifically for merging with the cache
        // (we use < rather than <= since the block size might be one more than
        // iterator.length())
        if (iterator.length() < cache.len) {
            // if four subarrays fit into the cache, it's faster to merge both
            // pairs of subarrays into the cache,
            // then merge the two merged subarrays from the cache back into the original array
            if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) {
                iterator.begin();
                while (!iterator.finished()) {
                    // merge A1 and B1 into the cache
                    var A1 = iterator.nextRange();
                    var B1 = iterator.nextRange();
                    var A2 = iterator.nextRange();
                    var B2 = iterator.nextRange();

                    if (lessThan(context, items[B1.end - 1], items[A1.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the cache
                        const a1_items = items[A1.start..A1.end];
                        @memcpy(cache[B1.length()..][0..a1_items.len], a1_items);
                        const b1_items = items[B1.start..B1.end];
                        @memcpy(cache[0..b1_items.len], b1_items);
                    } else if (lessThan(context, items[B1.start], items[A1.end - 1])) {
                        // these two ranges weren't already in order, so merge them into the cache
                        mergeInto(T, items, A1, B1, cache[0..], context, lessThan);
                    } else {
                        // if A1, B1, A2, and B2 are all in order, skip doing anything else
                        if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue;

                        // copy A1 and B1 into the cache in the same order
                        const a1_items = items[A1.start..A1.end];
                        @memcpy(cache[0..a1_items.len], a1_items);
                        const b1_items = items[B1.start..B1.end];
                        @memcpy(cache[A1.length()..][0..b1_items.len], b1_items);
                    }
                    A1 = Range.init(A1.start, B1.end);

                    // merge A2 and B2 into the cache
                    if (lessThan(context, items[B2.end - 1], items[A2.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the cache
                        const a2_items = items[A2.start..A2.end];
                        @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items);
                        const b2_items = items[B2.start..B2.end];
                        @memcpy(cache[A1.length()..][0..b2_items.len], b2_items);
                    } else if (lessThan(context, items[B2.start], items[A2.end - 1])) {
                        // these two ranges weren't already in order, so merge them into the cache
                        mergeInto(T, items, A2, B2, cache[A1.length()..], context, lessThan);
                    } else {
                        // copy A2 and B2 into the cache in the same order
                        const a2_items = items[A2.start..A2.end];
                        @memcpy(cache[A1.length()..][0..a2_items.len], a2_items);
                        const b2_items = items[B2.start..B2.end];
                        @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items);
                    }
                    A2 = Range.init(A2.start, B2.end);

                    // merge A1 and A2 from the cache into the items
                    const A3 = Range.init(0, A1.length());
                    const B3 = Range.init(A1.length(), A1.length() + A2.length());

                    if (lessThan(context, cache[B3.end - 1], cache[A3.start])) {
                        // the two ranges are in reverse order, so copy them in reverse order into the items
                        const a3_items = cache[A3.start..A3.end];
                        @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items);
                        const b3_items = cache[B3.start..B3.end];
                        @memcpy(items[A1.start..][0..b3_items.len], b3_items);
                    } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) {
                        // these two ranges weren't already in order, so merge them back into the items
                        mergeInto(T, cache[0..], A3, B3, items[A1.start..], context, lessThan);
                    } else {
                        // copy A3 and B3 into the items in the same order
                        const a3_items = cache[A3.start..A3.end];
                        @memcpy(items[A1.start..][0..a3_items.len], a3_items);
                        const b3_items = cache[B3.start..B3.end];
                        @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items);
                    }
                }

                // we merged two levels at the same time, so we're done with this level already
                // (iterator.nextLevel() is called again at the bottom of this outer merge loop)
                _ = iterator.nextLevel();
            } else {
                iterator.begin();
                while (!iterator.finished()) {
                    const A = iterator.nextRange();
                    const B = iterator.nextRange();

                    if (lessThan(context, items[B.end - 1], items[A.start])) {
                        // the two ranges are in reverse order, so a simple rotation should fix it
                        mem.rotate(T, items[A.start..B.end], A.length());
                    } else if (lessThan(context, items[B.start], items[A.end - 1])) {
                        // these two ranges weren't already in order, so we'll need to merge them!
                        const a_items = items[A.start..A.end];
                        @memcpy(cache[0..a_items.len], a_items);
                        mergeExternal(T, items, A, B, cache[0..], context, lessThan);
                    }
                }
            }
        } else {
            // this is where the in-place merge logic starts!
            // 1. pull out two internal buffers each containing √A unique values
            //    1a. adjust block_size and buffer_size if we couldn't find enough unique values
            // 2. loop over the A and B subarrays within this level of the merge sort
            // 3. break A and B into blocks of size 'block_size'
            // 4. "tag" each of the A blocks with values from the first internal buffer
            // 5. roll the A blocks through the B blocks and drop/rotate them where they belong
            // 6. merge each A block with any B values that follow, using the cache or the second internal buffer
            // 7. sort the second internal buffer if it exists
            // 8. redistribute the two internal buffers back into the items
            var block_size: usize = math.sqrt(iterator.length());
            var buffer_size = iterator.length() / block_size + 1;

            // as an optimization, we really only need to pull out the internal buffers once for each level of merges
            // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level
            var A: Range = undefined;
            var B: Range = undefined;
            var index: usize = 0;
            var last: usize = 0;
            var count: usize = 0;
            var find: usize = 0;
            var start: usize = 0;
            var pull_index: usize = 0;
            var pull = [_]Pull{
                Pull{
                    .from = 0,
                    .to = 0,
                    .count = 0,
                    .range = Range.init(0, 0),
                },
                Pull{
                    .from = 0,
                    .to = 0,
                    .count = 0,
                    .range = Range.init(0, 0),
                },
            };

            var buffer1 = Range.init(0, 0);
            var buffer2 = Range.init(0, 0);

            // find two internal buffers of size 'buffer_size' each
            find = buffer_size + buffer_size;
            var find_separately = false;

            if (block_size <= cache.len) {
                // if every A block fits into the cache then we won't need the second internal buffer,
                // so we really only need to find 'buffer_size' unique values
                find = buffer_size;
            } else if (find > iterator.length()) {
                // we can't fit both buffers into the same A or B subarray, so find two buffers separately
                find = buffer_size;
                find_separately = true;
            }

            // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each),
            // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
            // OR if we couldn't find that many unique values, we need the largest possible buffer we can get

            // in the case where it couldn't find a single buffer of at least √A unique values,
            // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
            iterator.begin();
            while (!iterator.finished()) {
                A = iterator.nextRange();
                B = iterator.nextRange();

                // just store information about where the values will be pulled from and to,
                // as well as how many values there are, to create the two internal buffers

                // check A for the number of unique values we need to fill an internal buffer
                // these values will be pulled out to the start of A
                last = A.start;
                count = 1;
                while (count < find) : ({
                    last = index;
                    count += 1;
                }) {
                    index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), find - count, context, lessThan);
                    if (index == A.end) break;
                }
                index = last;

                if (count >= buffer_size) {
                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = A.start,
                    };
                    pull_index = 1;

                    if (count == buffer_size + buffer_size) {
                        // we were able to find a single contiguous section containing 2√A unique values,
                        // so this section can be used to contain both of the internal buffers we'll need
                        buffer1 = Range.init(A.start, A.start + buffer_size);
                        buffer2 = Range.init(A.start + buffer_size, A.start + count);
                        break;
                    } else if (find == buffer_size + buffer_size) {
                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
                        // so we still need to find a second separate buffer of at least √A unique values
                        buffer1 = Range.init(A.start, A.start + count);
                        find = buffer_size;
                    } else if (block_size <= cache.len) {
                        // we found the first and only internal buffer that we need, so we're done!
                        buffer1 = Range.init(A.start, A.start + count);
                        break;
                    } else if (find_separately) {
                        // found one buffer, but now find the other one
                        buffer1 = Range.init(A.start, A.start + count);
                        find_separately = false;
                    } else {
                        // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
                        buffer2 = Range.init(A.start, A.start + count);
                        break;
                    }
                } else if (pull_index == 0 and count > buffer1.length()) {
                    // keep track of the largest buffer we were able to find
                    buffer1 = Range.init(A.start, A.start + count);
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = A.start,
                    };
                }

                // check B for the number of unique values we need to fill an internal buffer
                // these values will be pulled out to the end of B
                last = B.end - 1;
                count = 1;
                while (count < find) : ({
                    last = index - 1;
                    count += 1;
                }) {
                    index = findFirstBackward(T, items, items[last], Range.init(B.start, last), find - count, context, lessThan);
                    if (index == B.start) break;
                }
                index = last;

                if (count >= buffer_size) {
                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = B.end,
                    };
                    pull_index = 1;

                    if (count == buffer_size + buffer_size) {
                        // we were able to find a single contiguous section containing 2√A unique values,
                        // so this section can be used to contain both of the internal buffers we'll need
                        buffer1 = Range.init(B.end - count, B.end - buffer_size);
                        buffer2 = Range.init(B.end - buffer_size, B.end);
                        break;
                    } else if (find == buffer_size + buffer_size) {
                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
                        // so we still need to find a second separate buffer of at least √A unique values
                        buffer1 = Range.init(B.end - count, B.end);
                        find = buffer_size;
                    } else if (block_size <= cache.len) {
                        // we found the first and only internal buffer that we need, so we're done!
                        buffer1 = Range.init(B.end - count, B.end);
                        break;
                    } else if (find_separately) {
                        // found one buffer, but now find the other one
                        buffer1 = Range.init(B.end - count, B.end);
                        find_separately = false;
                    } else {
                        // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
                        // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
                        if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;

                        // we found a second buffer in an 'B' subarray containing √A unique values, so we're done!
                        buffer2 = Range.init(B.end - count, B.end);
                        break;
                    }
                } else if (pull_index == 0 and count > buffer1.length()) {
                    // keep track of the largest buffer we were able to find
                    buffer1 = Range.init(B.end - count, B.end);
                    pull[pull_index] = Pull{
                        .range = Range.init(A.start, B.end),
                        .count = count,
                        .from = index,
                        .to = B.end,
                    };
                }
            }

            // pull out the two ranges so we can use them as internal buffers
            pull_index = 0;
            while (pull_index < 2) : (pull_index += 1) {
                const length = pull[pull_index].count;

                if (pull[pull_index].to < pull[pull_index].from) {
                    // we're pulling the values out to the left, which means the start of an A subarray
                    index = pull[pull_index].from;
                    count = 1;
                    while (count < length) : (count += 1) {
                        index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, context, lessThan);
                        const range = Range.init(index + 1, pull[pull_index].from + 1);
                        mem.rotate(T, items[range.start..range.end], range.length() - count);
                        pull[pull_index].from = index + count;
                    }
                } else if (pull[pull_index].to > pull[pull_index].from) {
                    // we're pulling values out to the right, which means the end of a B subarray
                    index = pull[pull_index].from + 1;
                    count = 1;
                    while (count < length) : (count += 1) {
                        index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), length - count, context, lessThan);
                        const range = Range.init(pull[pull_index].from, index - 1);
                        mem.rotate(T, items[range.start..range.end], count);
                        pull[pull_index].from = index - 1 - count;
                    }
                }
            }

            // adjust block_size and buffer_size based on the values we were able to pull out
            buffer_size = buffer1.length();
            block_size = iterator.length() / buffer_size + 1;

            // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
            // so this was originally here to test the math for adjusting block_size above
            // assert((iterator.length() + 1)/block_size <= buffer_size);

            // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
            iterator.begin();
            while (!iterator.finished()) {
                A = iterator.nextRange();
                B = iterator.nextRange();

                // remove any parts of A or B that are being used by the internal buffers
                start = A.start;
                if (start == pull[0].range.start) {
                    if (pull[0].from > pull[0].to) {
                        A.start += pull[0].count;

                        // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
                        // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4,
                        // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal
                        if (A.length() == 0) continue;
                    } else if (pull[0].from < pull[0].to) {
                        B.end -= pull[0].count;
                        if (B.length() == 0) continue;
                    }
                }
                if (start == pull[1].range.start) {
                    if (pull[1].from > pull[1].to) {
                        A.start += pull[1].count;
                        if (A.length() == 0) continue;
                    } else if (pull[1].from < pull[1].to) {
                        B.end -= pull[1].count;
                        if (B.length() == 0) continue;
                    }
                }

                if (lessThan(context, items[B.end - 1], items[A.start])) {
                    // the two ranges are in reverse order, so a simple rotation should fix it
                    mem.rotate(T, items[A.start..B.end], A.length());
                } else if (lessThan(context, items[A.end], items[A.end - 1])) {
                    // these two ranges weren't already in order, so we'll need to merge them!
                    var findA: usize = undefined;

                    // break the remainder of A into blocks. firstA is the uneven-sized first A block
                    var blockA = Range.init(A.start, A.end);
                    var firstA = Range.init(A.start, A.start + blockA.length() % block_size);

                    // swap the first value of each A block with the value in buffer1
                    var indexA = buffer1.start;
                    index = firstA.end;
                    while (index < blockA.end) : ({
                        indexA += 1;
                        index += block_size;
                    }) {
                        mem.swap(T, &items[indexA], &items[index]);
                    }

                    // start rolling the A blocks through the B blocks!
                    // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
                    var lastA = firstA;
                    var lastB = Range.init(0, 0);
                    var blockB = Range.init(B.start, B.start + @min(block_size, B.length()));
                    blockA.start += firstA.length();
                    indexA = buffer1.start;

                    // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it
                    // otherwise, if the second buffer is available, block swap the contents into that
                    if (lastA.length() <= cache.len) {
                        const last_a_items = items[lastA.start..lastA.end];
                        @memcpy(cache[0..last_a_items.len], last_a_items);
                    } else if (buffer2.length() > 0) {
                        blockSwap(T, items, lastA.start, buffer2.start, lastA.length());
                    }

                    if (blockA.length() > 0) {
                        while (true) {
                            // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
                            // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
                            if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) {
                                // figure out where to split the previous B block, and rotate it at the split
                                const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan);
                                const B_remaining = lastB.end - B_split;

                                // swap the minimum A block to the beginning of the rolling A blocks
                                var minA = blockA.start;
                                findA = minA + block_size;
                                while (findA < blockA.end) : (findA += block_size) {
                                    if (lessThan(context, items[findA], items[minA])) {
                                        minA = findA;
                                    }
                                }
                                blockSwap(T, items, blockA.start, minA, block_size);

                                // swap the first item of the previous A block back with its original value, which is stored in buffer1
                                mem.swap(T, &items[blockA.start], &items[indexA]);
                                indexA += 1;

                                // locally merge the previous A block with the B values that follow it
                                // if lastA fits into the external cache we'll use that (with MergeExternal),
                                // or if the second internal buffer exists we'll use that (with MergeInternal),
                                // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)

                                if (lastA.length() <= cache.len) {
                                    mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), cache[0..], context, lessThan);
                                } else if (buffer2.length() > 0) {
                                    mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), buffer2, context, lessThan);
                                } else {
                                    mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan);
                                }

                                if (buffer2.length() > 0 or block_size <= cache.len) {
                                    // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway
                                    if (block_size <= cache.len) {
                                        @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]);
                                    } else {
                                        blockSwap(T, items, blockA.start, buffer2.start, block_size);
                                    }

                                    // this is equivalent to rotating, but faster
                                    // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
                                    // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
                                    blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining);
                                } else {
                                    // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
                                    mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split);
                                }

                                // update the range for the remaining A blocks, and the range remaining from the B block after it was split
                                lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
                                lastB = Range.init(lastA.end, lastA.end + B_remaining);

                                // if there are no more A blocks remaining, this step is finished!
                                blockA.start += block_size;
                                if (blockA.length() == 0) break;
                            } else if (blockB.length() < block_size) {
                                // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
                                // the cache is disabled here since it might contain the contents of the previous A block
                                mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start);

                                lastB = Range.init(blockA.start, blockA.start + blockB.length());
                                blockA.start += blockB.length();
                                blockA.end += blockB.length();
                                blockB.end = blockB.start;
                            } else {
                                // roll the leftmost A block to the end by swapping it with the next B block
                                blockSwap(T, items, blockA.start, blockB.start, block_size);
                                lastB = Range.init(blockA.start, blockA.start + block_size);

                                blockA.start += block_size;
                                blockA.end += block_size;
                                blockB.start += block_size;

                                if (blockB.end > B.end - block_size) {
                                    blockB.end = B.end;
                                } else {
                                    blockB.end += block_size;
                                }
                            }
                        }
                    }

                    // merge the last A block with the remaining B values
                    if (lastA.length() <= cache.len) {
                        mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), cache[0..], context, lessThan);
                    } else if (buffer2.length() > 0) {
                        mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), buffer2, context, lessThan);
                    } else {
                        mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan);
                    }
                }
            }

            // when we're finished with this merge step we should have the one
            // or two internal buffers left over, where the second buffer is all jumbled up
            // insertion sort the second buffer, then redistribute the buffers
            // back into the items using the opposite process used for creating the buffer

            // while an unstable sort like quicksort could be applied here, in benchmarks
            // it was consistently slightly slower than a simple insertion sort,
            // even for tens of millions of items. this may be because insertion
            // sort is quite fast when the data is already somewhat sorted, like it is here
            sort.insertion(T, items[buffer2.start..buffer2.end], context, lessThan);

            pull_index = 0;
            while (pull_index < 2) : (pull_index += 1) {
                var unique = pull[pull_index].count * 2;
                if (pull[pull_index].from > pull[pull_index].to) {
                    // the values were pulled out to the left, so redistribute them back to the right
                    var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
                    while (buffer.length() > 0) {
                        index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), unique, context, lessThan);
                        const amount = index - buffer.end;
                        mem.rotate(T, items[buffer.start..index], buffer.length());
                        buffer.start += (amount + 1);
                        buffer.end += amount;
                        unique -= 2;
                    }
                } else if (pull[pull_index].from < pull[pull_index].to) {
                    // the values were pulled out to the right, so redistribute them back to the left
                    var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
                    while (buffer.length() > 0) {
                        index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), unique, context, lessThan);
                        const amount = buffer.start - index;
                        mem.rotate(T, items[index..buffer.end], amount);
                        buffer.start -= amount;
                        buffer.end -= (amount + 1);
                        unique -= 2;
                    }
                }
            }
        }

        // double the size of each A and B subarray that will be merged in the next level
        if (!iterator.nextLevel()) break;
    }
}
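
// Usage sketch: `block` follows the same calling convention as the other sorters
// in `std.sort`; `sort.asc(T)` supplies a ready-made `lessThan` for numeric types.
test "block sort basic usage" {
    var data = [_]i32{ 5, 1, 4, 2, 3 };
    block(i32, &data, {}, sort.asc(i32));
    try std.testing.expectEqualSlices(i32, &[_]i32{ 1, 2, 3, 4, 5 }, &data);
}
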
// merge operation without a buffer
fn mergeInPlace(
    comptime T: type,
    items: []T,
    A_arg: Range,
    B_arg: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    if (A_arg.length() == 0 or B_arg.length() == 0) return;

    // this just repeatedly binary searches into B and rotates A into position.
    // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here,
    // but I decided to stick with this because it had better situational performance
    //
    // (Hwang and Lin is designed for merging subarrays of very different sizes,
    // but WikiSort almost always uses subarrays that are roughly the same size)
    //
    // normally this is incredibly suboptimal, but this function is only called
    // when none of the A or B blocks in any subarray contained 2√A unique values,
    // which places a hard limit on the number of times this will ACTUALLY need
    // to binary search and rotate.
    //
    // according to my analysis the worst case is √A rotations performed on √A items
    // once the constant factors are removed, which ends up being O(n)
    //
    // again, this is NOT a general-purpose solution - it only works well in this case!
    // kind of like how the O(n^2) insertion sort is used in some places

    var A = A_arg;
    var B = B_arg;

    while (true) {
        // find the first place in B where the first item in A needs to be inserted
        const mid = binaryFirst(T, items, items[A.start], B, context, lessThan);

        // rotate A into place
        const amount = mid - A.end;
        mem.rotate(T, items[A.start..mid], A.length());
        if (B.end == mid) break;

        // calculate the new A and B ranges
        B.start = mid;
        A = Range.init(A.start + amount, B.start);
        A.start = binaryLast(T, items, items[A.start], A, context, lessThan);
        if (A.length() == 0) break;
    }
}
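
// Small sanity check of the rotate-based merge described above: two adjacent
// sorted ranges are merged using rotations only, with no scratch buffer.
test "mergeInPlace merges two adjacent sorted ranges without a buffer" {
    var data = [_]i32{ 1, 3, 5, 2, 4, 6 };
    mergeInPlace(i32, &data, Range.init(0, 3), Range.init(3, 6), {}, sort.asc(i32));
    try std.testing.expectEqualSlices(i32, &[_]i32{ 1, 2, 3, 4, 5, 6 }, &data);
}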

// merge operation using an internal buffer
fn mergeInternal(
    comptime T: type,
    items: []T,
    A: Range,
    B: Range,
    buffer: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    // whenever we find a value to add to the final array, swap it with the value that's already in that spot
    // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order
    var A_count: usize = 0;
    var B_count: usize = 0;
    var insert: usize = 0;

    if (B.length() > 0 and A.length() > 0) {
        while (true) {
            if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) {
                mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]);
                A_count += 1;
                insert += 1;
                if (A_count >= A.length()) break;
            } else {
                mem.swap(T, &items[A.start + insert], &items[B.start + B_count]);
                B_count += 1;
                insert += 1;
                if (B_count >= B.length()) break;
            }
        }
    }

    // swap the remainder of A into the final array
    blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count);
}
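
// Illustrative sketch of the calling convention assumed here: the caller has
// already block swapped A's sorted values into `buffer`, so the A range holds
// the buffer's old contents. After merging, items[A.start..B.end] is sorted and
// the buffer region holds the displaced values in some order.
test "mergeInternal merges using an internal buffer inside the slice" {
    var items = [_]i32{ 1, 4, 6, 9, 8, 7, 2, 3, 5 };
    mergeInternal(i32, &items, Range.init(3, 6), Range.init(6, 9), Range.init(0, 3), {}, sort.asc(i32));
    try std.testing.expectEqualSlices(i32, &[_]i32{ 1, 2, 3, 4, 5, 6 }, items[3..9]);
}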

fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void {
    var index: usize = 0;
    while (index < block_size) : (index += 1) {
        mem.swap(T, &items[start1 + index], &items[start2 + index]);
    }
}
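
// Quick check: blockSwap exchanges two equally sized, non-overlapping blocks element by element.
test "blockSwap exchanges two equally sized blocks" {
    var data = [_]u8{ 1, 2, 3, 7, 8, 9 };
    blockSwap(u8, &data, 0, 3, 3);
    try std.testing.expectEqualSlices(u8, &[_]u8{ 7, 8, 9, 1, 2, 3 }, &data);
}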

// combine a linear search with a binary search to reduce the number of comparisons in situations
// where we have some idea as to how many unique values there are and where the next value might be
fn findFirstForward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const skip = @max(range.length() / unique, @as(usize, 1));

    var index = range.start + skip;
    while (lessThan(context, items[index - 1], value)) : (index += skip) {
        if (index >= range.end - skip) {
            return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan);
        }
    }

    return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan);
}
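
// Illustration of the hybrid search described above: the result matches a plain
// lower-bound search; the `unique` hint only controls how far each linear step skips.
test "findFirstForward returns the lower bound of the value" {
    var data = [_]i32{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const index = findFirstForward(i32, &data, 5, Range.init(0, data.len), 4, {}, sort.asc(i32));
    try std.testing.expectEqual(@as(usize, 4), index);
}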

fn findFirstBackward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const skip = @max(range.length() / unique, @as(usize, 1));

    var index = range.end - skip;
    while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) {
        if (index < range.start + skip) {
            return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan);
        }
    }

    return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan);
}

fn findLastForward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const skip = @max(range.length() / unique, @as(usize, 1));

    var index = range.start + skip;
    while (!lessThan(context, value, items[index - 1])) : (index += skip) {
        if (index >= range.end - skip) {
            return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan);
        }
    }

    return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan);
}

fn findLastBackward(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    unique: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    if (range.length() == 0) return range.start;
    const skip = @max(range.length() / unique, @as(usize, 1));

    var index = range.end - skip;
    while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) {
        if (index < range.start + skip) {
            return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan);
        }
    }

    return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan);
}

fn binaryFirst(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    var curr = range.start;
    var size = range.length();
    if (range.start >= range.end) return range.end;
    while (size > 0) {
        const offset = size % 2;

        size /= 2;
        const mid_item = items[curr + size];
        if (lessThan(context, mid_item, value)) {
            curr += size + offset;
        }
    }
    return curr;
}

fn binaryLast(
    comptime T: type,
    items: []T,
    value: T,
    range: Range,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) usize {
    var curr = range.start;
    var size = range.length();
    if (range.start >= range.end) return range.end;
    while (size > 0) {
        const offset = size % 2;

        size /= 2;
        const mid_item = items[curr + size];
        if (!lessThan(context, value, mid_item)) {
            curr += size + offset;
        }
    }
    return curr;
}
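
// binaryFirst/binaryLast are lower-bound/upper-bound searches restricted to `range`;
// a duplicate-heavy slice makes the difference visible.
test "binaryFirst and binaryLast bracket the run of equal values" {
    var data = [_]i32{ 1, 2, 2, 2, 3 };
    const range = Range.init(0, data.len);
    try std.testing.expectEqual(@as(usize, 1), binaryFirst(i32, &data, 2, range, {}, sort.asc(i32)));
    try std.testing.expectEqual(@as(usize, 4), binaryLast(i32, &data, 2, range, {}, sort.asc(i32)));
}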

fn mergeInto(
    comptime T: type,
    from: []T,
    A: Range,
    B: Range,
    into: []T,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    var A_index: usize = A.start;
    var B_index: usize = B.start;
    const A_last = A.end;
    const B_last = B.end;
    var insert_index: usize = 0;

    while (true) {
        if (!lessThan(context, from[B_index], from[A_index])) {
            into[insert_index] = from[A_index];
            A_index += 1;
            insert_index += 1;
            if (A_index == A_last) {
                // copy the remainder of B into the final array
                const from_b = from[B_index..B_last];
                @memcpy(into[insert_index..][0..from_b.len], from_b);
                break;
            }
        } else {
            into[insert_index] = from[B_index];
            B_index += 1;
            insert_index += 1;
            if (B_index == B_last) {
                // copy the remainder of A into the final array
                const from_a = from[A_index..A_last];
                @memcpy(into[insert_index..][0..from_a.len], from_a);
                break;
            }
        }
    }
}
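
// mergeInto copies rather than swaps: both source ranges live in `from` and the
// merged result is written to the start of `into`.
test "mergeInto merges two sorted ranges into a separate buffer" {
    var from = [_]i32{ 1, 4, 6, 2, 3, 5 };
    var into = [_]i32{0} ** 6;
    mergeInto(i32, &from, Range.init(0, 3), Range.init(3, 6), &into, {}, sort.asc(i32));
    try std.testing.expectEqualSlices(i32, &[_]i32{ 1, 2, 3, 4, 5, 6 }, &into);
}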

fn mergeExternal(
    comptime T: type,
    items: []T,
    A: Range,
    B: Range,
    cache: []T,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    // A fits into the cache, so use that instead of the internal buffer
    var A_index: usize = 0;
    var B_index: usize = B.start;
    var insert_index: usize = A.start;
    const A_last = A.length();
    const B_last = B.end;

    if (B.length() > 0 and A.length() > 0) {
        while (true) {
            if (!lessThan(context, items[B_index], cache[A_index])) {
                items[insert_index] = cache[A_index];
                A_index += 1;
                insert_index += 1;
                if (A_index == A_last) break;
            } else {
                items[insert_index] = items[B_index];
                B_index += 1;
                insert_index += 1;
                if (B_index == B_last) break;
            }
        }
    }

    // copy the remainder of A into the final array
    const cache_a = cache[A_index..A_last];
    @memcpy(items[insert_index..][0..cache_a.len], cache_a);
}
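
// Sketch of the layout mergeExternal expects: the cache plays the role of A, its
// sorted contents are merged with the B range in `items`, and the result
// overwrites items[A.start..B.end].
test "mergeExternal merges the cached A values with B" {
    var items = [_]i32{ 0, 0, 0, 2, 3, 5 };
    var cache = [_]i32{ 1, 4, 6 };
    mergeExternal(i32, &items, Range.init(0, 3), Range.init(3, 6), &cache, {}, sort.asc(i32));
    try std.testing.expectEqualSlices(i32, &[_]i32{ 1, 2, 3, 4, 5, 6 }, &items);
}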

fn swap(
    comptime T: type,
    items: []T,
    order: *[8]u8,
    x: usize,
    y: usize,
    context: anytype,
    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
) void {
    if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) {
        mem.swap(T, &items[x], &items[y]);
        mem.swap(u8, &(order.*)[x], &(order.*)[y]);
    }
}
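
// The `order` tie-break above is what makes the unstable sorting networks stable:
// equal items are only swapped when their original positions are out of order.
test "swap leaves equal items in their original order" {
    var items = [_]i32{ 2, 2, 1, 0, 0, 0, 0, 0 };
    var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
    swap(i32, &items, &order, 0, 1, {}, sort.asc(i32));
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 }, &order);
    swap(i32, &items, &order, 1, 2, {}, sort.asc(i32));
    try std.testing.expectEqual(@as(i32, 1), items[1]);
}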