master
1const std = @import("std");
2const builtin = @import("builtin");
3const Log2Int = std.math.Log2Int;
4const common = @import("common.zig");
5const HalveInt = common.HalveInt;
6
7const lo = switch (builtin.cpu.arch.endian()) {
8 .big => 1,
9 .little => 0,
10};
11const hi = 1 - lo;
12
13// Let _u1 and _u0 be the high and low limbs of U respectively.
14// Returns U / v_ and sets r = U % v_.
15fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T {
16 const HalfT = HalveInt(T, false).HalfT;
17 @setRuntimeSafety(common.test_safety);
18 var v = v_;
19
20 const b = @as(T, 1) << (@bitSizeOf(T) / 2);
21 var un64: T = undefined;
22 var un10: T = undefined;
23
24 const s: Log2Int(T) = @intCast(@clz(v));
25 if (s > 0) {
26 // Normalize divisor
27 v <<= s;
28 un64 = (_u1 << s) | (_u0 >> @intCast((@bitSizeOf(T) - @as(T, @intCast(s)))));
29 un10 = _u0 << s;
30 } else {
31 // Avoid undefined behavior of (u0 >> @bitSizeOf(T))
32 un64 = _u1;
33 un10 = _u0;
34 }
35
36 // Break divisor up into two 32-bit digits
37 const vn1 = v >> (@bitSizeOf(T) / 2);
38 const vn0 = v & std.math.maxInt(HalfT);
39
40 // Break right half of dividend into two digits
41 const un1 = un10 >> (@bitSizeOf(T) / 2);
42 const un0 = un10 & std.math.maxInt(HalfT);
43
44 // Compute the first quotient digit, q1
45 var q1 = un64 / vn1;
46 var rhat = un64 -% q1 *% vn1;
47
48 // q1 has at most error 2. No more than 2 iterations
49 while (q1 >= b or q1 * vn0 > b * rhat + un1) {
50 q1 -= 1;
51 rhat += vn1;
52 if (rhat >= b) break;
53 }
54
55 const un21 = un64 *% b +% un1 -% q1 *% v;
56
57 // Compute the second quotient digit
58 var q0 = un21 / vn1;
59 rhat = un21 -% q0 *% vn1;
60
61 // q0 has at most error 2. No more than 2 iterations.
62 while (q0 >= b or q0 * vn0 > b * rhat + un0) {
63 q0 -= 1;
64 rhat += vn1;
65 if (rhat >= b) break;
66 }
67
68 r.* = (un21 *% b +% un0 -% q0 *% v) >> s;
69 return q1 *% b +% q0;
70}
71
72fn divwide(comptime T: type, _u1: T, _u0: T, v: T, r: *T) T {
73 @setRuntimeSafety(common.test_safety);
74 if (T == u64 and builtin.target.cpu.arch == .x86_64 and builtin.target.os.tag != .windows) {
75 var rem: T = undefined;
76 const quo = asm (
77 \\divq %[v]
78 : [_] "={rax}" (-> T),
79 [_] "={rdx}" (rem),
80 : [v] "r" (v),
81 [_] "{rax}" (_u0),
82 [_] "{rdx}" (_u1),
83 );
84 r.* = rem;
85 return quo;
86 } else {
87 return divwide_generic(T, _u1, _u0, v, r);
88 }
89}
90
91// Returns a_ / b_ and sets maybe_rem = a_ % b.
92pub fn udivmod(comptime T: type, a_: T, b_: T, maybe_rem: ?*T) T {
93 @setRuntimeSafety(common.test_safety);
94 const HalfT = HalveInt(T, false).HalfT;
95 const SignedT = std.meta.Int(.signed, @bitSizeOf(T));
96
97 if (b_ > a_) {
98 if (maybe_rem) |rem| {
99 rem.* = a_;
100 }
101 return 0;
102 }
103
104 const a: [2]HalfT = @bitCast(a_);
105 const b: [2]HalfT = @bitCast(b_);
106 var q: [2]HalfT = undefined;
107 var r: [2]HalfT = undefined;
108
109 // When the divisor fits in 64 bits, we can use an optimized path
110 if (b[hi] == 0) {
111 r[hi] = 0;
112 if (a[hi] < b[lo]) {
113 // The result fits in 64 bits
114 q[hi] = 0;
115 q[lo] = divwide(HalfT, a[hi], a[lo], b[lo], &r[lo]);
116 } else {
117 // First, divide with the high part to get the remainder. After that a_hi < b_lo.
118 q[hi] = a[hi] / b[lo];
119 q[lo] = divwide(HalfT, a[hi] % b[lo], a[lo], b[lo], &r[lo]);
120 }
121 if (maybe_rem) |rem| {
122 rem.* = @bitCast(r);
123 }
124 return @bitCast(q);
125 }
126
127 // 0 <= shift <= 63
128 const shift: Log2Int(T) = @clz(b[hi]) - @clz(a[hi]);
129 var af: T = @bitCast(a);
130 var bf = @as(T, @bitCast(b)) << shift;
131 q = @bitCast(@as(T, 0));
132
133 for (0..shift + 1) |_| {
134 q[lo] <<= 1;
135 // Branchless version of:
136 // if (af >= bf) {
137 // af -= bf;
138 // q[lo] |= 1;
139 // }
140 const s = @as(SignedT, @bitCast(bf -% af -% 1)) >> (@bitSizeOf(T) - 1);
141 q[lo] |= @intCast(s & 1);
142 af -= bf & @as(T, @bitCast(s));
143 bf >>= 1;
144 }
145 if (maybe_rem) |rem| {
146 rem.* = @bitCast(af);
147 }
148 return @bitCast(q);
149}