Commit c9218f1719

Frank Denis <github@pureftpd.org>
2020-08-21 13:56:11
Make poly1305 faster
1 parent 243b5c7
Changed files (1)
lib
std
lib/std/crypto/poly1305.zig
@@ -3,224 +3,193 @@
 // This file is part of [zig](https://ziglang.org/), which is MIT licensed.
 // The MIT license requires this copyright notice to be included in all copies
 // and substantial portions of the software.
-// Translated from monocypher which is licensed under CC-0/BSD-3.
-//
-// https://monocypher.org/
-
-const std = @import("../std.zig");
-const builtin = std.builtin;
-
-const Endian = builtin.Endian;
-const readIntLittle = std.mem.readIntLittle;
-const writeIntLittle = std.mem.writeIntLittle;
+const std = @import("std");
+const mem = std.mem;
 
 pub const Poly1305 = struct {
-    const Self = @This();
-
+    pub const block_size: usize = 16;
     pub const mac_length = 16;
     pub const minimum_key_length = 32;
 
     // constant multiplier (from the secret key)
-    r: [4]u32,
+    r: [3]u64,
     // accumulated hash
-    h: [5]u32,
-    // chunk of the message
-    c: [5]u32,
+    h: [3]u64 = [_]u64{ 0, 0, 0 },
     // random number added at the end (from the secret key)
-    pad: [4]u32,
-    // How many bytes are there in the chunk.
-    c_idx: usize,
-
-    fn secureZero(self: *Self) void {
-        std.mem.secureZero(u8, @ptrCast([*]u8, self)[0..@sizeOf(Poly1305)]);
-    }
+    pad: [2]u64,
+    // how many bytes are waiting to be processed in a partial block
+    leftover: usize = 0,
+    // partial block buffer
+    buf: [block_size]u8 align(16) = undefined,
 
-    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
-        std.debug.assert(out.len >= mac_length);
+    pub fn init(key: []const u8) Poly1305 {
         std.debug.assert(key.len >= minimum_key_length);
+        const t0 = mem.readIntLittle(u64, key[0..8]);
+        const t1 = mem.readIntLittle(u64, key[8..16]);
+        return Poly1305{
+            .r = [_]u64{
+                t0 & 0xffc0fffffff,
+                ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff,
+                ((t1 >> 24)) & 0x00ffffffc0f,
+            },
+            .pad = [_]u64{
+                mem.readIntLittle(u64, key[16..24]),
+                mem.readIntLittle(u64, key[24..32]),
+            },
+        };
+    }
 
-        var ctx = Poly1305.init(key);
-        ctx.update(msg);
-        ctx.final(out);
+    fn blocks(st: *Poly1305, m: []const u8, last: comptime bool) void {
+        const hibit: u64 = if (last) 0 else 1 << 40;
+        const r0 = st.r[0];
+        const r1 = st.r[1];
+        const r2 = st.r[2];
+        const s1 = r1 * (5 << 2);
+        const s2 = r2 * (5 << 2);
+        var i: usize = 0;
+        while (i + block_size <= m.len) : (i += block_size) {
+            // h += m[i]
+            const t0 = mem.readIntLittle(u64, m[i..][0..8]);
+            const t1 = mem.readIntLittle(u64, m[i + 8 ..][0..8]);
+            st.h[0] += t0 & 0xfffffffffff;
+            st.h[1] += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
+            st.h[2] += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
+
+            // h *= r
+            const d0 = @as(u128, st.h[0]) * @as(u128, r0) + @as(u128, st.h[1]) * @as(u128, s2) + @as(u128, st.h[2]) * @as(u128, s1);
+            var d1 = @as(u128, st.h[0]) * @as(u128, r1) + @as(u128, st.h[1]) * @as(u128, r0) + @as(u128, st.h[2]) * @as(u128, s2);
+            var d2 = @as(u128, st.h[0]) * @as(u128, r2) + @as(u128, st.h[1]) * @as(u128, r1) + @as(u128, st.h[2]) * @as(u128, r0);
+
+            // partial reduction
+            var carry = d0 >> 44;
+            st.h[0] = @truncate(u64, d0) & 0xfffffffffff;
+            d1 += carry;
+            carry = @intCast(u64, d1 >> 44);
+            st.h[1] = @truncate(u64, d1) & 0xfffffffffff;
+            d2 += carry;
+            carry = @intCast(u64, d2 >> 42);
+            st.h[2] = @truncate(u64, d2) & 0x3ffffffffff;
+            st.h[0] += @truncate(u64, carry) * 5;
+            carry = st.h[0] >> 44;
+            st.h[0] &= 0xfffffffffff;
+            st.h[1] += @truncate(u64, carry);
+        }
     }
 
-    // Initialize the MAC context.
-    //   - key.len is sufficient size.
-    pub fn init(key: []const u8) Self {
-        var ctx: Poly1305 = undefined;
+    pub fn update(st: *Poly1305, m: []const u8) void {
+        var mb = m;
 
-        // Initial hash is zero
-        {
-            var i: usize = 0;
-            while (i < 5) : (i += 1) {
-                ctx.h[i] = 0;
+        // handle leftover
+        if (st.leftover > 0) {
+            const want = std.math.min(block_size - st.leftover, mb.len);
+            const mc = mb[0..want];
+            for (mc) |x, i| {
+                st.buf[st.leftover + i] = x;
             }
-        }
-        // add 2^130 to every input block
-        ctx.c[4] = 1;
-        polyClearC(&ctx);
-
-        // load r and pad (r has some of its bits cleared)
-        {
-            var i: usize = 0;
-            while (i < 1) : (i += 1) {
-                ctx.r[0] = readIntLittle(u32, key[0..4]) & 0x0fffffff;
+            mb = mb[want..];
+            st.leftover += want;
+            if (st.leftover > block_size) {
+                return;
             }
+            st.blocks(&st.buf, false);
+            st.leftover = 0;
         }
-        {
-            var i: usize = 1;
-            while (i < 4) : (i += 1) {
-                ctx.r[i] = readIntLittle(u32, key[i * 4 ..][0..4]) & 0x0ffffffc;
-            }
-        }
-        {
-            var i: usize = 0;
-            while (i < 4) : (i += 1) {
-                ctx.pad[i] = readIntLittle(u32, key[i * 4 + 16 ..][0..4]);
-            }
-        }
-
-        return ctx;
-    }
-
-    // h = (h + c) * r
-    // preconditions:
-    //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
-    //   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
-    //   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
-    // Postcondition:
-    //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
-    fn polyBlock(ctx: *Self) void {
-        // s = h + c, without carry propagation
-        const s0 = @as(u64, ctx.h[0]) + ctx.c[0]; // s0 <= 1_fffffffe
-        const s1 = @as(u64, ctx.h[1]) + ctx.c[1]; // s1 <= 1_fffffffe
-        const s2 = @as(u64, ctx.h[2]) + ctx.c[2]; // s2 <= 1_fffffffe
-        const s3 = @as(u64, ctx.h[3]) + ctx.c[3]; // s3 <= 1_fffffffe
-        const s4 = @as(u64, ctx.h[4]) + ctx.c[4]; // s4 <=          5
-
-        // Local all the things!
-        const r0 = ctx.r[0]; // r0  <= 0fffffff
-        const r1 = ctx.r[1]; // r1  <= 0ffffffc
-        const r2 = ctx.r[2]; // r2  <= 0ffffffc
-        const r3 = ctx.r[3]; // r3  <= 0ffffffc
-        const rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits...
-        const rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
-        const rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr1 == (r2 >> 2) * 5
-        const rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr1 == (r3 >> 2) * 5
-
-        // (h + c) * r, without carry propagation
-        const x0 = s0 * r0 + s1 * rr3 + s2 * rr2 + s3 * rr1 + s4 * rr0; //<=97ffffe007fffff8
-        const x1 = s0 * r1 + s1 * r0 + s2 * rr3 + s3 * rr2 + s4 * rr1; //<=8fffffe20ffffff6
-        const x2 = s0 * r2 + s1 * r1 + s2 * r0 + s3 * rr3 + s4 * rr2; //<=87ffffe417fffff4
-        const x3 = s0 * r3 + s1 * r2 + s2 * r1 + s3 * r0 + s4 * rr3; //<=7fffffe61ffffff2
-        const x4 = s4 * (r0 & 3); // ...recover 2 bits      //<=               f
-
-        // partial reduction modulo 2^130 - 5
-        const _u5 = @truncate(u32, x4 + (x3 >> 32)); // u5 <= 7ffffff5
-        const _u0 = (_u5 >> 2) * 5 + (x0 & 0xffffffff);
-        const _u1 = (_u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
-        const _u2 = (_u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
-        const _u3 = (_u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
-        const _u4 = (_u3 >> 32) + (_u5 & 3);
-
-        // Update the hash
-        ctx.h[0] = @truncate(u32, _u0); // u0 <= 1_9ffffff0
-        ctx.h[1] = @truncate(u32, _u1); // u1 <= 1_97ffffe0
-        ctx.h[2] = @truncate(u32, _u2); // u2 <= 1_8fffffe2
-        ctx.h[3] = @truncate(u32, _u3); // u3 <= 1_87ffffe4
-        ctx.h[4] = @truncate(u32, _u4); // u4 <=          4
-    }
-
-    // (re-)initializes the input counter and input buffer
-    fn polyClearC(ctx: *Self) void {
-        ctx.c[0] = 0;
-        ctx.c[1] = 0;
-        ctx.c[2] = 0;
-        ctx.c[3] = 0;
-        ctx.c_idx = 0;
-    }
 
-    fn polyTakeInput(ctx: *Self, input: u8) void {
-        const word = ctx.c_idx >> 2;
-        const byte = ctx.c_idx & 3;
-        ctx.c[word] |= std.math.shl(u32, input, byte * 8);
-        ctx.c_idx += 1;
-    }
+        // process full blocks
+        if (mb.len >= block_size) {
+            const want = mb.len & ~(block_size - 1);
+            st.blocks(mb[0..want], false);
+            mb = mb[want..];
+        }
 
-    fn polyUpdate(ctx: *Self, msg: []const u8) void {
-        for (msg) |b| {
-            polyTakeInput(ctx, b);
-            if (ctx.c_idx == 16) {
-                polyBlock(ctx);
-                polyClearC(ctx);
+        // store leftover
+        if (mb.len > 0) {
+            for (mb) |x, i| {
+                st.buf[st.leftover + i] = x;
             }
+            st.leftover += mb.len;
         }
     }
 
-    fn alignTo(x: usize, block_size: usize) usize {
-        return ((~x) +% 1) & (block_size - 1);
-    }
-
-    // Feed data into the MAC context.
-    pub fn update(ctx: *Self, msg: []const u8) void {
-        // Align ourselves with block boundaries
-        const alignm = std.math.min(alignTo(ctx.c_idx, 16), msg.len);
-        polyUpdate(ctx, msg[0..alignm]);
-
-        var nmsg = msg[alignm..];
-
-        // Process the msg block by block
-        const nb_blocks = nmsg.len >> 4;
-        var i: usize = 0;
-        while (i < nb_blocks) : (i += 1) {
-            ctx.c[0] = readIntLittle(u32, nmsg[0..4]);
-            ctx.c[1] = readIntLittle(u32, nmsg[4..8]);
-            ctx.c[2] = readIntLittle(u32, nmsg[8..12]);
-            ctx.c[3] = readIntLittle(u32, nmsg[12..16]);
-            polyBlock(ctx);
-            nmsg = nmsg[16..];
-        }
-        if (nb_blocks > 0) {
-            polyClearC(ctx);
+    pub fn final(st: *Poly1305, out: []u8) void {
+        std.debug.assert(out.len >= mac_length);
+        if (st.leftover > 0) {
+            var i = st.leftover;
+            st.buf[i] = 1;
+            i += 1;
+            while (i < block_size) : (i += 1) {
+                st.buf[i] = 0;
+            }
+            st.blocks(&st.buf, true);
         }
-
-        // remaining bytes
-        polyUpdate(ctx, nmsg[0..]);
+        // fully carry h
+        var carry = st.h[1] >> 44;
+        st.h[1] &= 0xfffffffffff;
+        st.h[2] += carry;
+        carry = st.h[2] >> 42;
+        st.h[2] &= 0x3ffffffffff;
+        st.h[0] += carry * 5;
+        carry = st.h[0] >> 44;
+        st.h[0] &= 0xfffffffffff;
+        st.h[1] += carry;
+        carry = st.h[1] >> 44;
+        st.h[1] &= 0xfffffffffff;
+        st.h[2] += carry;
+        carry = st.h[2] >> 42;
+        st.h[2] &= 0x3ffffffffff;
+        st.h[0] += carry * 5;
+        carry = st.h[0] >> 44;
+        st.h[0] &= 0xfffffffffff;
+        st.h[1] += carry;
+
+        // compute h + -p
+        var g0 = st.h[0] + 5;
+        carry = g0 >> 44;
+        g0 &= 0xfffffffffff;
+        var g1 = st.h[1] + carry;
+        carry = g1 >> 44;
+        g1 &= 0xfffffffffff;
+        var g2 = st.h[2] + carry -% (1 << 42);
+
+        // (hopefully) constant-time select h if h < p, or h + -p if h >= p
+        const mask = (g2 >> 63) -% 1;
+        g0 &= mask;
+        g1 &= mask;
+        g2 &= mask;
+        const nmask = ~mask;
+        st.h[0] = (st.h[0] & nmask) | g0;
+        st.h[1] = (st.h[1] & nmask) | g1;
+        st.h[2] = (st.h[2] & nmask) | g2;
+
+        // h = (h + pad)
+        const t0 = st.pad[0];
+        const t1 = st.pad[1];
+        st.h[0] += (t0 & 0xfffffffffff);
+        carry = (st.h[0] >> 44);
+        st.h[0] &= 0xfffffffffff;
+        st.h[1] += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + carry;
+        carry = (st.h[1] >> 44);
+        st.h[1] &= 0xfffffffffff;
+        st.h[2] += (((t1 >> 24)) & 0x3ffffffffff) + carry;
+        st.h[2] &= 0x3ffffffffff;
+
+        // mac = h % (2^128)
+        st.h[0] |= st.h[1] << 44;
+        st.h[1] = (st.h[1] >> 20) | (st.h[2] << 24);
+
+        mem.writeIntLittle(u64, out[0..8], st.h[0]);
+        mem.writeIntLittle(u64, out[8..16], st.h[1]);
+
+        mem.secureZero(u64, &st.r);
     }
 
-    // Finalize the MAC and output into buffer provided by caller.
-    pub fn final(ctx: *Self, out: []u8) void {
-        // Process the last block (if any)
-        if (ctx.c_idx != 0) {
-            // move the final 1 according to remaining input length
-            // (We may add less than 2^130 to the last input block)
-            ctx.c[4] = 0;
-            polyTakeInput(ctx, 1);
-            // one last hash update
-            polyBlock(ctx);
-        }
+    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
+        std.debug.assert(out.len >= mac_length);
+        std.debug.assert(key.len >= minimum_key_length);
 
-        // check if we should subtract 2^130-5 by performing the
-        // corresponding carry propagation.
-        const _u0 = @as(u64, 5) + ctx.h[0]; // <= 1_00000004
-        const _u1 = (_u0 >> 32) + ctx.h[1]; // <= 1_00000000
-        const _u2 = (_u1 >> 32) + ctx.h[2]; // <= 1_00000000
-        const _u3 = (_u2 >> 32) + ctx.h[3]; // <= 1_00000000
-        const _u4 = (_u3 >> 32) + ctx.h[4]; // <=          5
-        // u4 indicates how many times we should subtract 2^130-5 (0 or 1)
-
-        // h + pad, minus 2^130-5 if u4 exceeds 3
-        const uu0 = (_u4 >> 2) * 5 + ctx.h[0] + ctx.pad[0]; // <= 2_00000003
-        const uu1 = (uu0 >> 32) + ctx.h[1] + ctx.pad[1]; // <= 2_00000000
-        const uu2 = (uu1 >> 32) + ctx.h[2] + ctx.pad[2]; // <= 2_00000000
-        const uu3 = (uu2 >> 32) + ctx.h[3] + ctx.pad[3]; // <= 2_00000000
-
-        writeIntLittle(u32, out[0..4], @truncate(u32, uu0));
-        writeIntLittle(u32, out[4..8], @truncate(u32, uu1));
-        writeIntLittle(u32, out[8..12], @truncate(u32, uu2));
-        writeIntLittle(u32, out[12..16], @truncate(u32, uu3));
-
-        ctx.secureZero();
+        var st = Poly1305.init(key);
+        st.update(msg);
+        st.final(out);
     }
 };