zig/lib/std/math/float.zig at master

  1const std = @import("../std.zig");
  2const builtin = @import("builtin");
  3const assert = std.debug.assert;
  4const expect = std.testing.expect;
  5const expectEqual = std.testing.expectEqual;
  6
  7pub fn FloatRepr(comptime Float: type) type {
  8    const fractional_bits = floatFractionalBits(Float);
  9    const exponent_bits = floatExponentBits(Float);
 10    return packed struct {
 11        const Repr = @This();
 12
 13        mantissa: StoredMantissa,
 14        exponent: BiasedExponent,
 15        sign: std.math.Sign,
 16
 17        pub const StoredMantissa = @Int(.unsigned, floatMantissaBits(Float));
 18        pub const Mantissa = @Int(.unsigned, 1 + fractional_bits);
 19        pub const Exponent = @Int(.signed, exponent_bits);
 20        pub const BiasedExponent = enum(@Int(.unsigned, exponent_bits)) {
 21            denormal = 0,
 22            min_normal = 1,
 23            zero = (1 << (exponent_bits - 1)) - 1,
 24            max_normal = (1 << exponent_bits) - 2,
 25            infinite = (1 << exponent_bits) - 1,
 26            _,
 27
 28            pub const Int = @typeInfo(BiasedExponent).@"enum".tag_type;
 29
 30            pub fn unbias(biased: BiasedExponent) Exponent {
 31                switch (biased) {
 32                    .denormal => unreachable,
 33                    else => return @bitCast(@intFromEnum(biased) -% @intFromEnum(BiasedExponent.zero)),
 34                    .infinite => unreachable,
 35                }
 36            }
 37
 38            pub fn bias(unbiased: Exponent) BiasedExponent {
 39                return @enumFromInt(@intFromEnum(BiasedExponent.zero) +% @as(Int, @bitCast(unbiased)));
 40            }
 41        };
 42
 43        pub const Normalized = struct {
 44            fraction: Fraction,
 45            exponent: Normalized.Exponent,
 46
 47            pub const Fraction = @Int(.unsigned, fractional_bits);
 48            pub const Exponent = @Int(.signed, 1 + exponent_bits);
 49
 50            /// This currently truncates denormal values, which needs to be fixed before this can be used to
 51            /// produce a rounded value.
 52            pub fn reconstruct(normalized: Normalized, sign: std.math.Sign) Float {
 53                if (normalized.exponent > BiasedExponent.max_normal.unbias()) return @bitCast(Repr{
 54                    .mantissa = 0,
 55                    .exponent = .infinite,
 56                    .sign = sign,
 57                });
 58                const mantissa = @as(Mantissa, 1 << fractional_bits) | normalized.fraction;
 59                if (normalized.exponent < BiasedExponent.min_normal.unbias()) return @bitCast(Repr{
 60                    .mantissa = @truncate(std.math.shr(
 61                        Mantissa,
 62                        mantissa,
 63                        BiasedExponent.min_normal.unbias() - normalized.exponent,
 64                    )),
 65                    .exponent = .denormal,
 66                    .sign = sign,
 67                });
 68                return @bitCast(Repr{
 69                    .mantissa = @truncate(mantissa),
 70                    .exponent = .bias(@intCast(normalized.exponent)),
 71                    .sign = sign,
 72                });
 73            }
 74        };
 75
 76        pub const Classified = union(enum) { normalized: Normalized, infinity, nan, invalid };
 77        fn classify(repr: Repr) Classified {
 78            return switch (repr.exponent) {
 79                .denormal => {
 80                    const mantissa: Mantissa = repr.mantissa;
 81                    const shift = @clz(mantissa);
 82                    return .{ .normalized = .{
 83                        .fraction = @truncate(mantissa << shift),
 84                        .exponent = @as(Normalized.Exponent, comptime BiasedExponent.min_normal.unbias()) - shift,
 85                    } };
 86                },
 87                else => if (repr.mantissa <= std.math.maxInt(Normalized.Fraction)) .{ .normalized = .{
 88                    .fraction = @intCast(repr.mantissa),
 89                    .exponent = repr.exponent.unbias(),
 90                } } else .invalid,
 91                .infinite => switch (repr.mantissa) {
 92                    0 => .infinity,
 93                    else => .nan,
 94                },
 95            };
 96        }
 97    };
 98}
 99
100/// Creates a raw "1.0" mantissa for floating point type T. Used to dedupe f80 logic.
101inline fn mantissaOne(comptime T: type) comptime_int {
102    return if (@typeInfo(T).float.bits == 80) 1 << floatFractionalBits(T) else 0;
103}
104
105/// Creates floating point type T from an unbiased exponent and raw mantissa.
106inline fn reconstructFloat(comptime T: type, comptime exponent: comptime_int, comptime mantissa: comptime_int) T {
107    const TBits = @Int(.unsigned, @bitSizeOf(T));
108    const biased_exponent = @as(TBits, exponent + floatExponentMax(T));
109    return @as(T, @bitCast((biased_exponent << floatMantissaBits(T)) | @as(TBits, mantissa)));
110}
111
112/// Returns the number of bits in the exponent of floating point type T.
113pub inline fn floatExponentBits(comptime T: type) comptime_int {
114    comptime assert(@typeInfo(T) == .float);
115
116    return switch (@typeInfo(T).float.bits) {
117        16 => 5,
118        32 => 8,
119        64 => 11,
120        80 => 15,
121        128 => 15,
122        else => @compileError("unknown floating point type " ++ @typeName(T)),
123    };
124}
125
126/// Returns the number of bits in the mantissa of floating point type T.
127pub inline fn floatMantissaBits(comptime T: type) comptime_int {
128    comptime assert(@typeInfo(T) == .float);
129
130    return switch (@typeInfo(T).float.bits) {
131        16 => 10,
132        32 => 23,
133        64 => 52,
134        80 => 64,
135        128 => 112,
136        else => @compileError("unknown floating point type " ++ @typeName(T)),
137    };
138}
139
140/// Returns the number of fractional bits in the mantissa of floating point type T.
141pub inline fn floatFractionalBits(comptime T: type) comptime_int {
142    comptime assert(@typeInfo(T) == .float);
143
144    // standard IEEE floats have an implicit 0.m or 1.m integer part
145    // f80 is special and has an explicitly stored bit in the MSB
146    // this function corresponds to `MANT_DIG - 1' from C
147    return switch (@typeInfo(T).float.bits) {
148        16 => 10,
149        32 => 23,
150        64 => 52,
151        80 => 63,
152        128 => 112,
153        else => @compileError("unknown floating point type " ++ @typeName(T)),
154    };
155}
156
157/// Returns the minimum exponent that can represent
158/// a normalised value in floating point type T.
159pub inline fn floatExponentMin(comptime T: type) comptime_int {
160    return -floatExponentMax(T) + 1;
161}
162
163/// Returns the maximum exponent that can represent
164/// a normalised value in floating point type T.
165pub inline fn floatExponentMax(comptime T: type) comptime_int {
166    return (1 << (floatExponentBits(T) - 1)) - 1;
167}
168
169/// Returns the smallest subnormal number representable in floating point type T.
170pub inline fn floatTrueMin(comptime T: type) T {
171    return reconstructFloat(T, floatExponentMin(T) - 1, 1);
172}
173
174/// Returns the smallest normal number representable in floating point type T.
175pub inline fn floatMin(comptime T: type) T {
176    return reconstructFloat(T, floatExponentMin(T), mantissaOne(T));
177}
178
179/// Returns the largest normal number representable in floating point type T.
180pub inline fn floatMax(comptime T: type) T {
181    const all1s_mantissa = (1 << floatMantissaBits(T)) - 1;
182    return reconstructFloat(T, floatExponentMax(T), all1s_mantissa);
183}
184
185/// Returns the machine epsilon of floating point type T.
186pub inline fn floatEps(comptime T: type) T {
187    return reconstructFloat(T, -floatFractionalBits(T), mantissaOne(T));
188}
189
190/// Returns the local epsilon of floating point type T.
191pub inline fn floatEpsAt(comptime T: type, x: T) T {
192    switch (@typeInfo(T)) {
193        .float => |F| {
194            const U: type = @Int(.unsigned, F.bits);
195            const u: U = @bitCast(x);
196            const y: T = @bitCast(u ^ 1);
197            return @abs(x - y);
198        },
199        else => @compileError("floatEpsAt only supports floats"),
200    }
201}
202
203/// Returns the inf value for a floating point `Type`.
204pub inline fn inf(comptime Type: type) Type {
205    const RuntimeType = switch (Type) {
206        else => Type,
207        comptime_float => f128, // any float type will do
208    };
209    return reconstructFloat(RuntimeType, floatExponentMax(RuntimeType) + 1, mantissaOne(RuntimeType));
210}
211
212/// Returns the canonical quiet NaN representation for a floating point `Type`.
213pub inline fn nan(comptime Type: type) Type {
214    const RuntimeType = switch (Type) {
215        else => Type,
216        comptime_float => f128, // any float type will do
217    };
218    return reconstructFloat(
219        RuntimeType,
220        floatExponentMax(RuntimeType) + 1,
221        mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 1),
222    );
223}
224
225/// Returns a signalling NaN representation for a floating point `Type`.
226///
227/// TODO: LLVM is known to miscompile on some architectures to quiet NaN -
228///       this is tracked by https://github.com/ziglang/zig/issues/14366
229pub inline fn snan(comptime Type: type) Type {
230    const RuntimeType = switch (Type) {
231        else => Type,
232        comptime_float => f128, // any float type will do
233    };
234    return reconstructFloat(
235        RuntimeType,
236        floatExponentMax(RuntimeType) + 1,
237        mantissaOne(RuntimeType) | 1 << (floatFractionalBits(RuntimeType) - 2),
238    );
239}
240
241fn floatBits(comptime Type: type) !void {
242    // (1 +) for the sign bit, since it is separate from the other bits
243    const size = 1 + floatExponentBits(Type) + floatMantissaBits(Type);
244    try expect(@bitSizeOf(Type) == size);
245    try expect(floatFractionalBits(Type) <= floatMantissaBits(Type));
246
247    // for machine epsilon, assert expmin <= -prec <= expmax
248    try expect(floatExponentMin(Type) <= -floatFractionalBits(Type));
249    try expect(-floatFractionalBits(Type) <= floatExponentMax(Type));
250}
251test floatBits {
252    try floatBits(f16);
253    try floatBits(f32);
254    try floatBits(f64);
255    try floatBits(f80);
256    try floatBits(f128);
257    try floatBits(c_longdouble);
258}
259
260test inf {
261    const inf_u16: u16 = 0x7C00;
262    const inf_u32: u32 = 0x7F800000;
263    const inf_u64: u64 = 0x7FF0000000000000;
264    const inf_u80: u80 = 0x7FFF8000000000000000;
265    const inf_u128: u128 = 0x7FFF0000000000000000000000000000;
266    try expectEqual(inf_u16, @as(u16, @bitCast(inf(f16))));
267    try expectEqual(inf_u32, @as(u32, @bitCast(inf(f32))));
268    try expectEqual(inf_u64, @as(u64, @bitCast(inf(f64))));
269    try expectEqual(inf_u80, @as(u80, @bitCast(inf(f80))));
270    try expectEqual(inf_u128, @as(u128, @bitCast(inf(f128))));
271}
272
273test nan {
274    const qnan_u16: u16 = 0x7E00;
275    const qnan_u32: u32 = 0x7FC00000;
276    const qnan_u64: u64 = 0x7FF8000000000000;
277    const qnan_u80: u80 = 0x7FFFC000000000000000;
278    const qnan_u128: u128 = 0x7FFF8000000000000000000000000000;
279    try expectEqual(qnan_u16, @as(u16, @bitCast(nan(f16))));
280    try expectEqual(qnan_u32, @as(u32, @bitCast(nan(f32))));
281    try expectEqual(qnan_u64, @as(u64, @bitCast(nan(f64))));
282    try expectEqual(qnan_u80, @as(u80, @bitCast(nan(f80))));
283    try expectEqual(qnan_u128, @as(u128, @bitCast(nan(f128))));
284}
285
286test snan {
287    const snan_u16: u16 = 0x7D00;
288    const snan_u32: u32 = 0x7FA00000;
289    const snan_u64: u64 = 0x7FF4000000000000;
290    const snan_u80: u80 = 0x7FFFA000000000000000;
291    const snan_u128: u128 = 0x7FFF4000000000000000000000000000;
292    try expectEqual(snan_u16, @as(u16, @bitCast(snan(f16))));
293    try expectEqual(snan_u32, @as(u32, @bitCast(snan(f32))));
294    try expectEqual(snan_u64, @as(u64, @bitCast(snan(f64))));
295    try expectEqual(snan_u80, @as(u80, @bitCast(snan(f80))));
296    try expectEqual(snan_u128, @as(u128, @bitCast(snan(f128))));
297}