master
1const std = @import("../std.zig");
2const builtin = @import("builtin");
3const assert = std.debug.assert;
4const expect = std.testing.expect;
5const expectEqual = std.testing.expectEqual;
6
7pub fn FloatRepr(comptime Float: type) type {
8 const fractional_bits = floatFractionalBits(Float);
9 const exponent_bits = floatExponentBits(Float);
10 return packed struct {
11 const Repr = @This();
12
13 mantissa: StoredMantissa,
14 exponent: BiasedExponent,
15 sign: std.math.Sign,
16
17 pub const StoredMantissa = @Int(.unsigned, floatMantissaBits(Float));
18 pub const Mantissa = @Int(.unsigned, 1 + fractional_bits);
19 pub const Exponent = @Int(.signed, exponent_bits);
20 pub const BiasedExponent = enum(@Int(.unsigned, exponent_bits)) {
21 denormal = 0,
22 min_normal = 1,
23 zero = (1 << (exponent_bits - 1)) - 1,
24 max_normal = (1 << exponent_bits) - 2,
25 infinite = (1 << exponent_bits) - 1,
26 _,
27
28 pub const Int = @typeInfo(BiasedExponent).@"enum".tag_type;
29
30 pub fn unbias(biased: BiasedExponent) Exponent {
31 switch (biased) {
32 .denormal => unreachable,
33 else => return @bitCast(@intFromEnum(biased) -% @intFromEnum(BiasedExponent.zero)),
34 .infinite => unreachable,
35 }
36 }
37
38 pub fn bias(unbiased: Exponent) BiasedExponent {
39 return @enumFromInt(@intFromEnum(BiasedExponent.zero) +% @as(Int, @bitCast(unbiased)));
40 }
41 };
42
43 pub const Normalized = struct {
44 fraction: Fraction,
45 exponent: Normalized.Exponent,
46
47 pub const Fraction = @Int(.unsigned, fractional_bits);
48 pub const Exponent = @Int(.signed, 1 + exponent_bits);
49
50 /// This currently truncates denormal values, which needs to be fixed before this can be used to
51 /// produce a rounded value.
52 pub fn reconstruct(normalized: Normalized, sign: std.math.Sign) Float {
53 if (normalized.exponent > BiasedExponent.max_normal.unbias()) return @bitCast(Repr{
54 .mantissa = 0,
55 .exponent = .infinite,
56 .sign = sign,
57 });
58 const mantissa = @as(Mantissa, 1 << fractional_bits) | normalized.fraction;
59 if (normalized.exponent < BiasedExponent.min_normal.unbias()) return @bitCast(Repr{
60 .mantissa = @truncate(std.math.shr(
61 Mantissa,
62 mantissa,
63 BiasedExponent.min_normal.unbias() - normalized.exponent,
64 )),
65 .exponent = .denormal,
66 .sign = sign,
67 });
68 return @bitCast(Repr{
69 .mantissa = @truncate(mantissa),
70 .exponent = .bias(@intCast(normalized.exponent)),
71 .sign = sign,
72 });
73 }
74 };
75
76 pub const Classified = union(enum) { normalized: Normalized, infinity, nan, invalid };
77 fn classify(repr: Repr) Classified {
78 return switch (repr.exponent) {
79 .denormal => {
80 const mantissa: Mantissa = repr.mantissa;
81 const shift = @clz(mantissa);
82 return .{ .normalized = .{
83 .fraction = @truncate(mantissa << shift),
84 .exponent = @as(Normalized.Exponent, comptime BiasedExponent.min_normal.unbias()) - shift,
85 } };
86 },
87 else => if (repr.mantissa <= std.math.maxInt(Normalized.Fraction)) .{ .normalized = .{
88 .fraction = @intCast(repr.mantissa),
89 .exponent = repr.exponent.unbias(),
90 } } else .invalid,
91 .infinite => switch (repr.mantissa) {
92 0 => .infinity,
93 else => .nan,
94 },
95 };
96 }
97 };
98}
99
100/// Creates a raw "1.0" mantissa for floating point type T. Used to dedupe f80 logic.
101inline fn mantissaOne(comptime T: type) comptime_int {
102 return if (@typeInfo(T).float.bits == 80) 1 << floatFractionalBits(T) else 0;
103}
104
105/// Creates floating point type T from an unbiased exponent and raw mantissa.
106inline fn reconstructFloat(comptime T: type, comptime exponent: comptime_int, comptime mantissa: comptime_int) T {
107 const TBits = @Int(.unsigned, @bitSizeOf(T));
108 const biased_exponent = @as(TBits, exponent + floatExponentMax(T));
109 return @as(T, @bitCast((biased_exponent << floatMantissaBits(T)) | @as(TBits, mantissa)));
110}
111
112/// Returns the number of bits in the exponent of floating point type T.
113pub inline fn floatExponentBits(comptime T: type) comptime_int {
114 comptime assert(@typeInfo(T) == .float);
115
116 return switch (@typeInfo(T).float.bits) {
117 16 => 5,
118 32 => 8,
119 64 => 11,
120 80 => 15,
121 128 => 15,
122 else => @compileError("unknown floating point type " ++ @typeName(T)),
123 };
124}
125
126/// Returns the number of bits in the mantissa of floating point type T.
127pub inline fn floatMantissaBits(comptime T: type) comptime_int {
128 comptime assert(@typeInfo(T) == .float);
129
130 return switch (@typeInfo(T).float.bits) {
131 16 => 10,
132 32 => 23,
133 64 => 52,
134 80 => 64,
135 128 => 112,
136 else => @compileError("unknown floating point type " ++ @typeName(T)),
137 };
138}
139
140/// Returns the number of fractional bits in the mantissa of floating point type T.
141pub inline fn floatFractionalBits(comptime T: type) comptime_int {
142 comptime assert(@typeInfo(T) == .float);
143
144 // standard IEEE floats have an implicit 0.m or 1.m integer part
145 // f80 is special and has an explicitly stored bit in the MSB
146 // this function corresponds to `MANT_DIG - 1' from C
147 return switch (@typeInfo(T).float.bits) {
148 16 => 10,
149 32 => 23,
150 64 => 52,
151 80 => 63,
152 128 => 112,
153 else => @compileError("unknown floating point type " ++ @typeName(T)),
154 };
155}
156
157/// Returns the minimum exponent that can represent
158/// a normalised value in floating point type T.
159pub inline fn floatExponentMin(comptime T: type) comptime_int {
160 return -floatExponentMax(T) + 1;
161}
162
163/// Returns the maximum exponent that can represent
164/// a normalised value in floating point type T.
165pub inline fn floatExponentMax(comptime T: type) comptime_int {
166 return (1 << (floatExponentBits(T) - 1)) - 1;
167}
168
169/// Returns the smallest subnormal number representable in floating point type T.
170pub inline fn floatTrueMin(comptime T: type) T {
171 return reconstructFloat(T, floatExponentMin(T) - 1, 1);
172}
173
174/// Returns the smallest normal number representable in floating point type T.
175pub inline fn floatMin(comptime T: type) T {
176 return reconstructFloat(T, floatExponentMin(T), mantissaOne(T));
177}
178
179/// Returns the largest normal number representable in floating point type T.
180pub inline fn floatMax(comptime T: type) T {
181 const all1s_mantissa = (1 << floatMantissaBits(T)) - 1;
182 return reconstructFloat(T, floatExponentMax(T), all1s_mantissa);
183}
184
185/// Returns the machine epsilon of floating point type T.
186pub inline fn floatEps(comptime T: type) T {
187 return reconstructFloat(T, -floatFractionalBits(T), mantissaOne(T));
188}
189
190/// Returns the local epsilon of floating point type T.
191pub inline fn floatEpsAt(comptime T: type, x: T) T {
192 switch (@typeInfo(T)) {
193 .float => |F| {
194 const U: type = @Int(.unsigned, F.bits);
195 const u: U = @bitCast(x);
196 const y: T = @bitCast(u ^ 1);
197 return @abs(x - y);
198 },
199 else => @compileError("floatEpsAt only supports floats"),
200 }
201}
202
203/// Returns the inf value for a floating point `Type`.
204pub inline fn inf(comptime Type: type) Type {
205 const RuntimeType = switch (Type) {
206 else => Type,
207 comptime_float => f128, // any float type will do
208 };
209 return reconstructFloat(RuntimeType, floatExponentMax(RuntimeType) + 1, mantissaOne(RuntimeType));
210}
211
212/// Returns the canonical quiet NaN representation for a floating point `Type`.
213pub inline fn nan(comptime Type: type) Type {
214 const RuntimeType = switch (Type) {
215 else => Type,
216 comptime_float => f128, // any float type will do
217 };
218 return reconstructFloat(
219 RuntimeType,
220 floatExponentMax(RuntimeType) + 1,
221 mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 1),
222 );
223}
224
225/// Returns a signalling NaN representation for a floating point `Type`.
226///
227/// TODO: LLVM is known to miscompile on some architectures to quiet NaN -
228/// this is tracked by https://github.com/ziglang/zig/issues/14366
229pub inline fn snan(comptime Type: type) Type {
230 const RuntimeType = switch (Type) {
231 else => Type,
232 comptime_float => f128, // any float type will do
233 };
234 return reconstructFloat(
235 RuntimeType,
236 floatExponentMax(RuntimeType) + 1,
237 mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 2),
238 );
239}
240
241fn floatBits(comptime Type: type) !void {
242 // (1 +) for the sign bit, since it is separate from the other bits
243 const size = 1 + floatExponentBits(Type) + floatMantissaBits(Type);
244 try expect(@bitSizeOf(Type) == size);
245 try expect(floatFractionalBits(Type) <= floatMantissaBits(Type));
246
247 // for machine epsilon, assert expmin <= -prec <= expmax
248 try expect(floatExponentMin(Type) <= -floatFractionalBits(Type));
249 try expect(-floatFractionalBits(Type) <= floatExponentMax(Type));
250}
251test floatBits {
252 try floatBits(f16);
253 try floatBits(f32);
254 try floatBits(f64);
255 try floatBits(f80);
256 try floatBits(f128);
257 try floatBits(c_longdouble);
258}
259
260test inf {
261 const inf_u16: u16 = 0x7C00;
262 const inf_u32: u32 = 0x7F800000;
263 const inf_u64: u64 = 0x7FF0000000000000;
264 const inf_u80: u80 = 0x7FFF8000000000000000;
265 const inf_u128: u128 = 0x7FFF0000000000000000000000000000;
266 try expectEqual(inf_u16, @as(u16, @bitCast(inf(f16))));
267 try expectEqual(inf_u32, @as(u32, @bitCast(inf(f32))));
268 try expectEqual(inf_u64, @as(u64, @bitCast(inf(f64))));
269 try expectEqual(inf_u80, @as(u80, @bitCast(inf(f80))));
270 try expectEqual(inf_u128, @as(u128, @bitCast(inf(f128))));
271}
272
273test nan {
274 const qnan_u16: u16 = 0x7E00;
275 const qnan_u32: u32 = 0x7FC00000;
276 const qnan_u64: u64 = 0x7FF8000000000000;
277 const qnan_u80: u80 = 0x7FFFC000000000000000;
278 const qnan_u128: u128 = 0x7FFF8000000000000000000000000000;
279 try expectEqual(qnan_u16, @as(u16, @bitCast(nan(f16))));
280 try expectEqual(qnan_u32, @as(u32, @bitCast(nan(f32))));
281 try expectEqual(qnan_u64, @as(u64, @bitCast(nan(f64))));
282 try expectEqual(qnan_u80, @as(u80, @bitCast(nan(f80))));
283 try expectEqual(qnan_u128, @as(u128, @bitCast(nan(f128))));
284}
285
286test snan {
287 const snan_u16: u16 = 0x7D00;
288 const snan_u32: u32 = 0x7FA00000;
289 const snan_u64: u64 = 0x7FF4000000000000;
290 const snan_u80: u80 = 0x7FFFA000000000000000;
291 const snan_u128: u128 = 0x7FFF4000000000000000000000000000;
292 try expectEqual(snan_u16, @as(u16, @bitCast(snan(f16))));
293 try expectEqual(snan_u32, @as(u32, @bitCast(snan(f32))));
294 try expectEqual(snan_u64, @as(u64, @bitCast(snan(f64))));
295 try expectEqual(snan_u80, @as(u80, @bitCast(snan(f80))));
296 try expectEqual(snan_u128, @as(u128, @bitCast(snan(f128))));
297}