Commit 636308a17d

Frank Denis <124872+jedisct1@users.noreply.github.com>
2024-11-22 10:00:49
std.crypto.aes: introduce AES block vectors (#22023)
* std.crypto.aes: introduce AES block vectors Modern Intel CPUs with the VAES extension can handle more than a single AES block per instruction. So can some ARM and RISC-V CPUs. Software implementations with bitslicing can also greatly benefit from this. Implement low-level operations on AES block vectors, and the parallel AEGIS variants on top of them. AMD Zen4: aegis-128x4: 73225 MiB/s aegis-128x2: 51571 MiB/s aegis-128l: 25806 MiB/s aegis-256x4: 46742 MiB/s aegis-256x2: 30227 MiB/s aegis-256: 8436 MiB/s aes128-gcm: 5926 MiB/s aes256-gcm: 5085 MiB/s AES-GCM, and anything based on AES-CTR are also going to benefit from this later. * Make AEGIS-MAC twice as fast
1 parent f845fa0
lib/std/crypto/aes/aesni.zig
@@ -2,18 +2,23 @@ const std = @import("../../std.zig");
 const builtin = @import("builtin");
 const mem = std.mem;
 const debug = std.debug;
-const BlockVec = @Vector(2, u64);
+
+const has_vaes = builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .vaes);
+const has_avx512f = builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f);
 
 /// A single AES block.
 pub const Block = struct {
+    const Repr = @Vector(2, u64);
+
+    /// The length of an AES block in bytes.
     pub const block_length: usize = 16;
 
     /// Internal representation of a block.
-    repr: BlockVec,
+    repr: Repr,
 
     /// Convert a byte sequence into an internal representation.
     pub inline fn fromBytes(bytes: *const [16]u8) Block {
-        const repr = mem.bytesToValue(BlockVec, bytes);
+        const repr = mem.bytesToValue(Repr, bytes);
         return Block{ .repr = repr };
     }
 
@@ -33,7 +38,7 @@ pub const Block = struct {
         return Block{
             .repr = asm (
                 \\ vaesenc %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                 : [in] "x" (block.repr),
                   [rk] "x" (round_key.repr),
             ),
@@ -45,7 +50,7 @@ pub const Block = struct {
         return Block{
             .repr = asm (
                 \\ vaesenclast %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                 : [in] "x" (block.repr),
                   [rk] "x" (round_key.repr),
             ),
@@ -57,7 +62,7 @@ pub const Block = struct {
         return Block{
             .repr = asm (
                 \\ vaesdec %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                 : [in] "x" (block.repr),
                   [rk] "x" (inv_round_key.repr),
             ),
@@ -69,7 +74,7 @@ pub const Block = struct {
         return Block{
             .repr = asm (
                 \\ vaesdeclast %[rk], %[in], %[out]
-                : [out] "=x" (-> BlockVec),
+                : [out] "=x" (-> Repr),
                 : [in] "x" (block.repr),
                   [rk] "x" (inv_round_key.repr),
             ),
@@ -168,17 +173,158 @@ pub const Block = struct {
     };
 };
 
+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction (4 requires VAES and AVX512F, 2 requires VAES).
+        pub const native_vector_size = w: {
+            if (has_vaes and has_avx512f and blocks_count % 4 == 0) break :w 4;
+            if (has_vaes and blocks_count % 2 == 0) break :w 2;
+            break :w 1;
+        };
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        const native_words = blocks_count / native_vector_size;
+
+        const Repr = @Vector(native_vector_size * 2, u64);
+
+        /// Internal representation of a block vector, as `native_words` native words of `native_vector_size` blocks each.
+        repr: [native_words]Repr,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = mem.bytesToValue(Repr, bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = mem.toBytes(block_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence, and return the result as a byte sequence.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var x: Self = undefined;
+            inline for (0..native_words) |i| {
+                x.repr[i] = block_vec.repr[i] ^ mem.bytesToValue(Repr, bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return x.toBytes();
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesenc %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesenclast %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesdec %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (inv_round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = asm (
+                    \\ vaesdeclast %[rk], %[in], %[out]
+                    : [out] "=x" (-> Repr),
+                    : [in] "x" (block_vec.repr[i]),
+                      [rk] "x" (inv_round_key_vec.repr[i]),
+                );
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] ^ block_vec2.repr[i];
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] & block_vec2.repr[i];
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i] | block_vec2.repr[i];
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
     std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
     const rounds = Aes.rounds;
 
     return struct {
         const Self = @This();
+
+        const Repr = Aes.block.Repr;
+
         round_keys: [rounds + 1]Block,
 
-        fn drc(comptime second: bool, comptime rc: u8, t: BlockVec, tx: BlockVec) BlockVec {
-            var s: BlockVec = undefined;
-            var ts: BlockVec = undefined;
+        fn drc(comptime second: bool, comptime rc: u8, t: Repr, tx: Repr) Repr {
+            var s: Repr = undefined;
+            var ts: Repr = undefined;
             return asm (
                 \\ vaeskeygenassist %[rc], %[t], %[s]
                 \\ vpslldq $4, %[tx], %[ts]
@@ -187,7 +333,7 @@ fn KeySchedule(comptime Aes: type) type {
                 \\ vpxor   %[ts], %[r], %[r]
                 \\ vpshufd %[mask], %[s], %[ts]
                 \\ vpxor   %[ts], %[r], %[r]
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                   [s] "=&x" (s),
                   [ts] "=&x" (ts),
                 : [rc] "n" (rc),
@@ -234,7 +380,7 @@ fn KeySchedule(comptime Aes: type) type {
                 inv_round_keys[i] = Block{
                     .repr = asm (
                         \\ vaesimc %[rk], %[inv_rk]
-                        : [inv_rk] "=x" (-> BlockVec),
+                        : [inv_rk] "=x" (-> Repr),
                         : [rk] "x" (round_keys[rounds - i].repr),
                     ),
                 };
lib/std/crypto/aes/armcrypto.zig
@@ -1,18 +1,19 @@
 const std = @import("../../std.zig");
 const mem = std.mem;
 const debug = std.debug;
-const BlockVec = @Vector(2, u64);
 
 /// A single AES block.
 pub const Block = struct {
+    const Repr = @Vector(2, u64);
+
     pub const block_length: usize = 16;
 
     /// Internal representation of a block.
-    repr: BlockVec,
+    repr: Repr,
 
     /// Convert a byte sequence into an internal representation.
     pub inline fn fromBytes(bytes: *const [16]u8) Block {
-        const repr = mem.bytesToValue(BlockVec, bytes);
+        const repr = mem.bytesToValue(Repr, bytes);
         return Block{ .repr = repr };
     }
 
@@ -36,7 +37,7 @@ pub const Block = struct {
                 \\ mov   %[out].16b, %[in].16b
                 \\ aese  %[out].16b, %[zero].16b
                 \\ aesmc %[out].16b, %[out].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                 : [in] "x" (block.repr),
                   [zero] "x" (zero),
             )) ^ round_key.repr,
@@ -49,7 +50,7 @@ pub const Block = struct {
             .repr = (asm (
                 \\ mov   %[out].16b, %[in].16b
                 \\ aese  %[out].16b, %[zero].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                 : [in] "x" (block.repr),
                   [zero] "x" (zero),
             )) ^ round_key.repr,
@@ -63,7 +64,7 @@ pub const Block = struct {
                 \\ mov   %[out].16b, %[in].16b
                 \\ aesd  %[out].16b, %[zero].16b
                 \\ aesimc %[out].16b, %[out].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                 : [in] "x" (block.repr),
                   [zero] "x" (zero),
             )) ^ inv_round_key.repr,
@@ -76,7 +77,7 @@ pub const Block = struct {
             .repr = (asm (
                 \\ mov   %[out].16b, %[in].16b
                 \\ aesd  %[out].16b, %[zero].16b
-                : [out] "=&x" (-> BlockVec),
+                : [out] "=&x" (-> Repr),
                 : [in] "x" (block.repr),
                   [zero] "x" (zero),
             )) ^ inv_round_key.repr,
@@ -165,6 +166,118 @@ pub const Block = struct {
     };
 };
 
+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction.
+        pub const native_vector_size = 1;
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        const native_words = blocks_count;
+
+        /// Internal representation of a block vector, as one `Block` per AES block.
+        repr: [native_words]Block,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].toBytes();
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence, and return the result as a byte sequence.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            inline for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].xorBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encrypt(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encryptLast(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decrypt(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decryptLast(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].xorBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].andBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            inline for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].orBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
     std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
     const rounds = Aes.rounds;
@@ -172,17 +285,19 @@ fn KeySchedule(comptime Aes: type) type {
     return struct {
         const Self = @This();
 
+        const Repr = Aes.block.Repr;
+
         const zero = @Vector(2, u64){ 0, 0 };
         const mask1 = @Vector(16, u8){ 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12 };
         const mask2 = @Vector(16, u8){ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 };
 
         round_keys: [rounds + 1]Block,
 
-        fn drc128(comptime rc: u8, t: BlockVec) BlockVec {
-            var v1: BlockVec = undefined;
-            var v2: BlockVec = undefined;
-            var v3: BlockVec = undefined;
-            var v4: BlockVec = undefined;
+        fn drc128(comptime rc: u8, t: Repr) Repr {
+            var v1: Repr = undefined;
+            var v2: Repr = undefined;
+            var v3: Repr = undefined;
+            var v4: Repr = undefined;
 
             return asm (
                 \\ movi %[v2].4s, %[rc]
@@ -196,7 +311,7 @@ fn KeySchedule(comptime Aes: type) type {
                 \\ eor  %[v1].16b, %[v1].16b, %[r].16b
                 \\ eor  %[r].16b, %[v1].16b, %[v3].16b
                 \\ eor  %[r].16b, %[r].16b, %[v4].16b
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                   [v1] "=&x" (v1),
                   [v2] "=&x" (v2),
                   [v3] "=&x" (v3),
@@ -208,11 +323,11 @@ fn KeySchedule(comptime Aes: type) type {
             );
         }
 
-        fn drc256(comptime second: bool, comptime rc: u8, t: BlockVec, tx: BlockVec) BlockVec {
-            var v1: BlockVec = undefined;
-            var v2: BlockVec = undefined;
-            var v3: BlockVec = undefined;
-            var v4: BlockVec = undefined;
+        fn drc256(comptime second: bool, comptime rc: u8, t: Repr, tx: Repr) Repr {
+            var v1: Repr = undefined;
+            var v2: Repr = undefined;
+            var v3: Repr = undefined;
+            var v4: Repr = undefined;
 
             return asm (
                 \\ movi %[v2].4s, %[rc]
@@ -226,7 +341,7 @@ fn KeySchedule(comptime Aes: type) type {
                 \\ eor  %[v1].16b, %[v1].16b, %[v2].16b
                 \\ eor  %[v1].16b, %[v1].16b, %[v3].16b
                 \\ eor  %[r].16b, %[v1].16b, %[v4].16b
-                : [r] "=&x" (-> BlockVec),
+                : [r] "=&x" (-> Repr),
                   [v1] "=&x" (v1),
                   [v2] "=&x" (v2),
                   [v3] "=&x" (v3),
@@ -276,7 +391,7 @@ fn KeySchedule(comptime Aes: type) type {
                 inv_round_keys[i] = Block{
                     .repr = asm (
                         \\ aesimc %[inv_rk].16b, %[rk].16b
-                        : [inv_rk] "=x" (-> BlockVec),
+                        : [inv_rk] "=x" (-> Repr),
                         : [rk] "x" (round_keys[rounds - i].repr),
                     ),
                 };
lib/std/crypto/aes/soft.zig
@@ -2,16 +2,16 @@ const std = @import("../../std.zig");
 const math = std.math;
 const mem = std.mem;
 
-const BlockVec = [4]u32;
-
 const side_channels_mitigations = std.options.side_channels_mitigations;
 
 /// A single AES block.
 pub const Block = struct {
+    const Repr = [4]u32;
+
     pub const block_length: usize = 16;
 
     /// Internal representation of a block.
-    repr: BlockVec align(16),
+    repr: Repr align(16),
 
     /// Convert a byte sequence into an internal representation.
     pub inline fn fromBytes(bytes: *const [16]u8) Block {
@@ -19,7 +19,7 @@ pub const Block = struct {
         const s1 = mem.readInt(u32, bytes[4..8], .little);
         const s2 = mem.readInt(u32, bytes[8..12], .little);
         const s3 = mem.readInt(u32, bytes[12..16], .little);
-        return Block{ .repr = BlockVec{ s0, s1, s2, s3 } };
+        return Block{ .repr = Repr{ s0, s1, s2, s3 } };
     }
 
     /// Convert the internal representation of a block into a byte sequence.
@@ -65,7 +65,7 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Encrypt a block with a round key *WITHOUT ANY PROTECTION AGAINST SIDE CHANNELS*
@@ -110,7 +110,7 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Encrypt a block with the last round key.
@@ -136,7 +136,7 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Decrypt a block with a round key.
@@ -161,7 +161,7 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Decrypt a block with a round key *WITHOUT ANY PROTECTION AGAINST SIDE CHANNELS*
@@ -206,7 +206,7 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Decrypt a block with the last round key.
@@ -232,12 +232,12 @@ pub const Block = struct {
         t2 ^= round_key.repr[2];
         t3 ^= round_key.repr[3];
 
-        return Block{ .repr = BlockVec{ t0, t1, t2, t3 } };
+        return Block{ .repr = Repr{ t0, t1, t2, t3 } };
     }
 
     /// Apply the bitwise XOR operation to the content of two blocks.
     pub inline fn xorBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
         comptime var i = 0;
         inline while (i < 4) : (i += 1) {
             x[i] = block1.repr[i] ^ block2.repr[i];
@@ -247,7 +247,7 @@ pub const Block = struct {
 
     /// Apply the bitwise AND operation to the content of two blocks.
     pub inline fn andBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
         comptime var i = 0;
         inline while (i < 4) : (i += 1) {
             x[i] = block1.repr[i] & block2.repr[i];
@@ -257,7 +257,7 @@ pub const Block = struct {
 
     /// Apply the bitwise OR operation to the content of two blocks.
     pub inline fn orBlocks(block1: Block, block2: Block) Block {
-        var x: BlockVec = undefined;
+        var x: Repr = undefined;
         comptime var i = 0;
         inline while (i < 4) : (i += 1) {
             x[i] = block1.repr[i] | block2.repr[i];
@@ -332,6 +332,118 @@ pub const Block = struct {
     };
 };
 
+/// A fixed-size vector of AES blocks.
+/// All operations are performed in parallel, using SIMD instructions when available.
+pub fn BlockVec(comptime blocks_count: comptime_int) type {
+    return struct {
+        const Self = @This();
+
+        /// The number of AES blocks the target architecture can process with a single instruction.
+        pub const native_vector_size = 1;
+
+        /// The size of the AES block vector that the target architecture can process with a single instruction, in bytes.
+        pub const native_word_size = native_vector_size * 16;
+
+        const native_words = blocks_count;
+
+        /// Internal representation of a block vector, as one `Block` per AES block.
+        repr: [native_words]Block,
+
+        /// Length of the block vector in bytes.
+        pub const block_length: usize = blocks_count * 16;
+
+        /// Convert a byte sequence into an internal representation.
+        pub inline fn fromBytes(bytes: *const [blocks_count * 16]u8) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = Block.fromBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Convert the internal representation of a block vector into a byte sequence.
+        pub inline fn toBytes(block_vec: Self) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].toBytes();
+            }
+            return out;
+        }
+
+        /// XOR the block vector with a byte sequence, and return the result as a byte sequence.
+        pub inline fn xorBytes(block_vec: Self, bytes: *const [blocks_count * 16]u8) [blocks_count * 16]u8 {
+            var out: [blocks_count * 16]u8 = undefined;
+            for (0..native_words) |i| {
+                out[i * native_word_size ..][0..native_word_size].* = block_vec.repr[i].xorBytes(bytes[i * native_word_size ..][0..native_word_size]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of round keys.
+        pub inline fn encrypt(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encrypt(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the forward AES operation to the block vector with a vector of last round keys.
+        pub inline fn encryptLast(block_vec: Self, round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].encryptLast(round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of round keys.
+        pub inline fn decrypt(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decrypt(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the inverse AES operation to the block vector with a vector of last round keys.
+        pub inline fn decryptLast(block_vec: Self, inv_round_key_vec: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec.repr[i].decryptLast(inv_round_key_vec.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise XOR operation to the content of two block vectors.
+        pub inline fn xorBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].xorBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise AND operation to the content of two block vectors.
+        pub inline fn andBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].andBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+
+        /// Apply the bitwise OR operation to the content of two block vectors.
+        pub inline fn orBlocks(block_vec1: Self, block_vec2: Self) Self {
+            var out: Self = undefined;
+            for (0..native_words) |i| {
+                out.repr[i] = block_vec1.repr[i].orBlocks(block_vec2.repr[i]);
+            }
+            return out;
+        }
+    };
+}
+
 fn KeySchedule(comptime Aes: type) type {
     std.debug.assert(Aes.rounds == 10 or Aes.rounds == 14);
     const key_length = Aes.key_bits / 8;
@@ -671,7 +783,7 @@ fn mul(a: u8, b: u8) u8 {
 
 const cache_line_bytes = std.atomic.cache_line;
 
-inline fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u8 {
+fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u8 {
     if (side_channels_mitigations == .none) {
         return [4]u8{
             sbox[idx0],
@@ -709,7 +821,7 @@ inline fn sbox_lookup(sbox: *align(64) const [256]u8, idx0: u8, idx1: u8, idx2:
     }
 }
 
-inline fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u32 {
+fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8, idx2: u8, idx3: u8) [4]u32 {
     if (side_channels_mitigations == .none) {
         return [4]u32{
             table[0][idx0],
@@ -718,17 +830,18 @@ inline fn table_lookup(table: *align(64) const [4][256]u32, idx0: u8, idx1: u8,
             table[3][idx3],
         };
     } else {
+        const table_len: usize = 256;
         const stride = switch (side_channels_mitigations) {
             .none => unreachable,
-            .basic => table[0].len / 4,
-            .medium => @max(1, @min(table[0].len, 2 * cache_line_bytes / 4)),
-            .full => @max(1, @min(table[0].len, cache_line_bytes / 4)),
+            .basic => table_len / 4,
+            .medium => @max(1, @min(table_len, 2 * cache_line_bytes / 4)),
+            .full => @max(1, @min(table_len, cache_line_bytes / 4)),
         };
         const of0 = idx0 % stride;
         const of1 = idx1 % stride;
         const of2 = idx2 % stride;
         const of3 = idx3 % stride;
-        var t: [4][table[0].len / stride]u32 align(64) = undefined;
+        var t: [4][table_len / stride]u32 align(64) = undefined;
         var i: usize = 0;
         while (i < t[0].len) : (i += 1) {
             const tx = table[0][i * stride ..];
lib/std/crypto/aegis.zig
@@ -1,16 +1,21 @@
 //! AEGIS is a very fast authenticated encryption system built on top of the core AES function.
 //!
-//! The AEGIS-128L variant has a 128 bit key, a 128 bit nonce, and processes 256 bit message blocks.
-//! The AEGIS-256  variant has a 256 bit key, a 256 bit nonce, and processes 128 bit message blocks.
+//! The AEGIS-128* variants have a 128 bit key and a 128 bit nonce.
+//! The AEGIS-256* variants have a 256 bit key and a 256 bit nonce.
+//! All of them can compute 128 and 256 bit authentication tags.
 //!
 //! The AEGIS cipher family offers performance that significantly exceeds that of AES-GCM with
 //! hardware support for parallelizable AES block encryption.
 //!
-//! Unlike with AES-GCM, nonces can be safely chosen at random with no practical limit when using AEGIS-256.
-//! AEGIS-128L also allows for more messages to be safely encrypted when using random nonces.
+//! On high-end Intel CPUs with AVX-512 support, AEGIS-128X4 and AEGIS-256X4 are the fastest options.
+//! On other modern server, desktop and mobile CPUs, AEGIS-128X2 and AEGIS-256X2 are usually the fastest options.
+//! AEGIS-128L and AEGIS-256 perform well on a broad range of platforms, including WebAssembly.
 //!
-//! AEGIS is believed to be key-committing, making it a safer choice than most other AEADs
-//! when the key has low entropy, or can be controlled by an attacker.
+//! Unlike with AES-GCM, nonces can be safely chosen at random with no practical limit when using AEGIS-256*.
+//! AEGIS-128* also allows for more messages to be safely encrypted when using random nonces.
+//!
+//! Unless the associated data can be fully controlled by an adversary, AEGIS is believed to be key-committing,
+//! making it a safer choice than most other AEADs when the key has low entropy, or can be controlled by an attacker.
 //!
 //! Finally, leaking the state does not leak the key.
 //!
@@ -20,122 +25,202 @@ const std = @import("std");
 const crypto = std.crypto;
 const mem = std.mem;
 const assert = std.debug.assert;
-const AesBlock = crypto.core.aes.Block;
 const AuthenticationError = crypto.errors.AuthenticationError;
 
-/// AEGIS-128L with a 128-bit authentication tag.
-pub const Aegis128L = Aegis128LGeneric(128);
-
-/// AEGIS-128L with a 256-bit authentication tag.
-pub const Aegis128L_256 = Aegis128LGeneric(256);
-
-/// AEGIS-256 with a 128-bit authentication tag.
-pub const Aegis256 = Aegis256Generic(128);
-
-/// AEGIS-256 with a 256-bit authentication tag.
-pub const Aegis256_256 = Aegis256Generic(256);
-
-const State128L = struct {
-    blocks: [8]AesBlock,
-
-    fn init(key: [16]u8, nonce: [16]u8) State128L {
-        const c1 = AesBlock.fromBytes(&[16]u8{ 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd });
-        const c2 = AesBlock.fromBytes(&[16]u8{ 0x0, 0x1, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 });
-        const key_block = AesBlock.fromBytes(&key);
-        const nonce_block = AesBlock.fromBytes(&nonce);
-        const blocks = [8]AesBlock{
-            key_block.xorBlocks(nonce_block),
-            c1,
-            c2,
-            c1,
-            key_block.xorBlocks(nonce_block),
-            key_block.xorBlocks(c2),
-            key_block.xorBlocks(c1),
-            key_block.xorBlocks(c2),
-        };
-        var state = State128L{ .blocks = blocks };
-        var i: usize = 0;
-        while (i < 10) : (i += 1) {
-            state.update(nonce_block, key_block);
+/// AEGIS-128X4 with a 128 bit tag
+pub const Aegis128X4 = Aegis128XGeneric(4, 128);
+/// AEGIS-128X2 with a 128 bit tag
+pub const Aegis128X2 = Aegis128XGeneric(2, 128);
+/// AEGIS-128L with a 128 bit tag
+pub const Aegis128L = Aegis128XGeneric(1, 128);
+
+/// AEGIS-256X4 with a 128 bit tag
+pub const Aegis256X4 = Aegis256XGeneric(4, 128);
+/// AEGIS-256X2 with a 128 bit tag
+pub const Aegis256X2 = Aegis256XGeneric(2, 128);
+/// AEGIS-256 with a 128 bit tag
+pub const Aegis256 = Aegis256XGeneric(1, 128);
+
+/// AEGIS-128X4 with a 256 bit tag
+pub const Aegis128X4_256 = Aegis128XGeneric(4, 256);
+/// AEGIS-128X2 with a 256 bit tag
+pub const Aegis128X2_256 = Aegis128XGeneric(2, 256);
+/// AEGIS-128L with a 256 bit tag
+pub const Aegis128L_256 = Aegis128XGeneric(1, 256);
+
+/// AEGIS-256X4 with a 256 bit tag
+pub const Aegis256X4_256 = Aegis256XGeneric(4, 256);
+/// AEGIS-256X2 with a 256 bit tag
+pub const Aegis256X2_256 = Aegis256XGeneric(2, 256);
+/// AEGIS-256 with a 256 bit tag
+pub const Aegis256_256 = Aegis256XGeneric(1, 256);
+
+fn State128X(comptime degree: u7) type {
+    return struct {
+        const AesBlockVec = crypto.core.aes.BlockVec(degree);
+        const State = @This();
+
+        blocks: [8]AesBlockVec,
+
+        const aes_block_length = AesBlockVec.block_length;
+        const rate = aes_block_length * 2;
+        const alignment = AesBlockVec.native_word_size;
+
+        fn init(key: [16]u8, nonce: [16]u8) State {
+            const c1 = AesBlockVec.fromBytes(&[16]u8{ 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd } ** degree);
+            const c2 = AesBlockVec.fromBytes(&[16]u8{ 0x0, 0x1, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 } ** degree);
+            const key_block = AesBlockVec.fromBytes(&(key ** degree));
+            const nonce_block = AesBlockVec.fromBytes(&(nonce ** degree));
+            const blocks = [8]AesBlockVec{
+                key_block.xorBlocks(nonce_block),
+                c1,
+                c2,
+                c1,
+                key_block.xorBlocks(nonce_block),
+                key_block.xorBlocks(c2),
+                key_block.xorBlocks(c1),
+                key_block.xorBlocks(c2),
+            };
+            var state = State{ .blocks = blocks };
+            if (degree > 1) {
+                const context_block = ctx: {
+                    var contexts_bytes = [_]u8{0} ** aes_block_length;
+                    for (0..degree) |i| {
+                        contexts_bytes[i * 16] = @intCast(i);
+                        contexts_bytes[i * 16 + 1] = @intCast(degree - 1);
+                    }
+                    break :ctx AesBlockVec.fromBytes(&contexts_bytes);
+                };
+                for (0..10) |_| {
+                    state.blocks[3] = state.blocks[3].xorBlocks(context_block);
+                    state.blocks[7] = state.blocks[7].xorBlocks(context_block);
+                    state.update(nonce_block, key_block);
+                }
+            } else {
+                for (0..10) |_| {
+                    state.update(nonce_block, key_block);
+                }
+            }
+            return state;
         }
-        return state;
-    }
 
-    inline fn update(state: *State128L, d1: AesBlock, d2: AesBlock) void {
-        const blocks = &state.blocks;
-        const tmp = blocks[7];
-        comptime var i: usize = 7;
-        inline while (i > 0) : (i -= 1) {
-            blocks[i] = blocks[i - 1].encrypt(blocks[i]);
+        inline fn update(state: *State, d1: AesBlockVec, d2: AesBlockVec) void {
+            const blocks = &state.blocks;
+            const tmp = blocks[7];
+            comptime var i: usize = 7;
+            inline while (i > 0) : (i -= 1) {
+                blocks[i] = blocks[i - 1].encrypt(blocks[i]);
+            }
+            blocks[0] = tmp.encrypt(blocks[0]);
+            blocks[0] = blocks[0].xorBlocks(d1);
+            blocks[4] = blocks[4].xorBlocks(d2);
         }
-        blocks[0] = tmp.encrypt(blocks[0]);
-        blocks[0] = blocks[0].xorBlocks(d1);
-        blocks[4] = blocks[4].xorBlocks(d2);
-    }
 
-    fn absorb(state: *State128L, src: *const [32]u8) void {
-        const msg0 = AesBlock.fromBytes(src[0..16]);
-        const msg1 = AesBlock.fromBytes(src[16..32]);
-        state.update(msg0, msg1);
-    }
+        fn absorb(state: *State, src: *const [rate]u8) void {
+            const msg0 = AesBlockVec.fromBytes(src[0..aes_block_length]);
+            const msg1 = AesBlockVec.fromBytes(src[aes_block_length..rate]);
+            state.update(msg0, msg1);
+        }
 
-    fn enc(state: *State128L, dst: *[32]u8, src: *const [32]u8) void {
-        const blocks = &state.blocks;
-        const msg0 = AesBlock.fromBytes(src[0..16]);
-        const msg1 = AesBlock.fromBytes(src[16..32]);
-        var tmp0 = msg0.xorBlocks(blocks[6]).xorBlocks(blocks[1]);
-        var tmp1 = msg1.xorBlocks(blocks[2]).xorBlocks(blocks[5]);
-        tmp0 = tmp0.xorBlocks(blocks[2].andBlocks(blocks[3]));
-        tmp1 = tmp1.xorBlocks(blocks[6].andBlocks(blocks[7]));
-        dst[0..16].* = tmp0.toBytes();
-        dst[16..32].* = tmp1.toBytes();
-        state.update(msg0, msg1);
-    }
+        fn enc(state: *State, dst: *[rate]u8, src: *const [rate]u8) void {
+            const blocks = &state.blocks;
+            const msg0 = AesBlockVec.fromBytes(src[0..aes_block_length]);
+            const msg1 = AesBlockVec.fromBytes(src[aes_block_length..rate]);
+            var tmp0 = msg0.xorBlocks(blocks[6]).xorBlocks(blocks[1]);
+            var tmp1 = msg1.xorBlocks(blocks[2]).xorBlocks(blocks[5]);
+            tmp0 = tmp0.xorBlocks(blocks[2].andBlocks(blocks[3]));
+            tmp1 = tmp1.xorBlocks(blocks[6].andBlocks(blocks[7]));
+            dst[0..aes_block_length].* = tmp0.toBytes();
+            dst[aes_block_length..rate].* = tmp1.toBytes();
+            state.update(msg0, msg1);
+        }
 
-    fn dec(state: *State128L, dst: *[32]u8, src: *const [32]u8) void {
-        const blocks = &state.blocks;
-        var msg0 = AesBlock.fromBytes(src[0..16]).xorBlocks(blocks[6]).xorBlocks(blocks[1]);
-        var msg1 = AesBlock.fromBytes(src[16..32]).xorBlocks(blocks[2]).xorBlocks(blocks[5]);
-        msg0 = msg0.xorBlocks(blocks[2].andBlocks(blocks[3]));
-        msg1 = msg1.xorBlocks(blocks[6].andBlocks(blocks[7]));
-        dst[0..16].* = msg0.toBytes();
-        dst[16..32].* = msg1.toBytes();
-        state.update(msg0, msg1);
-    }
+        fn dec(state: *State, dst: *[rate]u8, src: *const [rate]u8) void {
+            const blocks = &state.blocks;
+            var msg0 = AesBlockVec.fromBytes(src[0..aes_block_length]).xorBlocks(blocks[6]).xorBlocks(blocks[1]);
+            var msg1 = AesBlockVec.fromBytes(src[aes_block_length..rate]).xorBlocks(blocks[2]).xorBlocks(blocks[5]);
+            msg0 = msg0.xorBlocks(blocks[2].andBlocks(blocks[3]));
+            msg1 = msg1.xorBlocks(blocks[6].andBlocks(blocks[7]));
+            dst[0..aes_block_length].* = msg0.toBytes();
+            dst[aes_block_length..rate].* = msg1.toBytes();
+            state.update(msg0, msg1);
+        }
 
-    fn mac(state: *State128L, comptime tag_bits: u9, adlen: usize, mlen: usize) [tag_bits / 8]u8 {
-        const blocks = &state.blocks;
-        var sizes: [16]u8 = undefined;
-        mem.writeInt(u64, sizes[0..8], @as(u64, adlen) * 8, .little);
-        mem.writeInt(u64, sizes[8..16], @as(u64, mlen) * 8, .little);
-        const tmp = AesBlock.fromBytes(&sizes).xorBlocks(blocks[2]);
-        var i: usize = 0;
-        while (i < 7) : (i += 1) {
-            state.update(tmp, tmp);
+        fn decLast(state: *State, dst: []u8, src: []const u8) void {
+            const blocks = &state.blocks;
+            const z0 = blocks[6].xorBlocks(blocks[1]).xorBlocks(blocks[2].andBlocks(blocks[3]));
+            const z1 = blocks[2].xorBlocks(blocks[5]).xorBlocks(blocks[6].andBlocks(blocks[7]));
+            var pad = [_]u8{0} ** rate;
+            pad[0..aes_block_length].* = z0.toBytes();
+            pad[aes_block_length..].* = z1.toBytes();
+            for (pad[0..src.len], src) |*p, x| p.* ^= x;
+            @memcpy(dst, pad[0..src.len]);
+            @memset(pad[src.len..], 0);
+            const msg0 = AesBlockVec.fromBytes(pad[0..aes_block_length]);
+            const msg1 = AesBlockVec.fromBytes(pad[aes_block_length..rate]);
+            state.update(msg0, msg1);
         }
-        return switch (tag_bits) {
-            128 => blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3])
-                .xorBlocks(blocks[4]).xorBlocks(blocks[5]).xorBlocks(blocks[6]).toBytes(),
-            256 => tag: {
-                const t1 = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3]);
-                const t2 = blocks[4].xorBlocks(blocks[5]).xorBlocks(blocks[6]).xorBlocks(blocks[7]);
-                break :tag t1.toBytes() ++ t2.toBytes();
-            },
-            else => unreachable,
-        };
-    }
-};
 
-fn Aegis128LGeneric(comptime tag_bits: u9) type {
+        fn mac(state: *State, comptime tag_bits: u9, adlen: usize, mlen: usize) [tag_bits / 8]u8 {
+            const blocks = &state.blocks;
+            var sizes: [aes_block_length]u8 = undefined;
+            mem.writeInt(u64, sizes[0..8], @as(u64, adlen) * 8, .little);
+            mem.writeInt(u64, sizes[8..16], @as(u64, mlen) * 8, .little);
+            for (1..degree) |i| {
+                @memcpy(sizes[i * 16 ..][0..16], sizes[0..16]);
+            }
+            const tmp = AesBlockVec.fromBytes(&sizes).xorBlocks(blocks[2]);
+            for (0..7) |_| {
+                state.update(tmp, tmp);
+            }
+            switch (tag_bits) {
+                128 => {
+                    var tag_multi = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3]).xorBlocks(blocks[4]).xorBlocks(blocks[5]).xorBlocks(blocks[6]).toBytes();
+                    var tag = tag_multi[0..16].*;
+                    @memcpy(tag[0..], tag_multi[0..16]);
+                    for (1..degree) |d| {
+                        for (0..16) |i| {
+                            tag[i] ^= tag_multi[d * 16 + i];
+                        }
+                    }
+                    return tag;
+                },
+                256 => {
+                    const tag_multi_1 = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3]).toBytes();
+                    const tag_multi_2 = blocks[4].xorBlocks(blocks[5]).xorBlocks(blocks[6]).xorBlocks(blocks[7]).toBytes();
+                    var tag = tag_multi_1[0..16].* ++ tag_multi_2[0..16].*;
+                    for (1..degree) |d| {
+                        for (0..16) |i| {
+                            tag[i] ^= tag_multi_1[d * 16 + i];
+                            tag[i + 16] ^= tag_multi_2[d * 16 + i];
+                        }
+                    }
+                    return tag;
+                },
+                else => unreachable,
+            }
+        }
+    };
+}
+
+/// AEGIS is a very fast authenticated encryption system built on top of the core AES function.
+///
+/// The 128 bit variants of AEGIS have a 128 bit key and a 128 bit nonce.
+///
+/// https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/
+fn Aegis128XGeneric(comptime degree: u7, comptime tag_bits: u9) type {
+    comptime assert(degree > 0); // degree must be greater than 0
     comptime assert(tag_bits == 128 or tag_bits == 256); // tag must be 128 or 256 bits
 
     return struct {
+        const State = State128X(degree);
+
         pub const tag_length = tag_bits / 8;
         pub const nonce_length = 16;
         pub const key_length = 16;
-        pub const block_length = 32;
+        pub const block_length = State.rate;
 
-        const State = State128L;
+        const alignment = State.alignment;
 
         /// c: ciphertext: output buffer should be of size m.len
         /// tag: authentication tag: output MAC
@@ -145,27 +230,27 @@ fn Aegis128LGeneric(comptime tag_bits: u9) type {
         /// k: private key
         pub fn encrypt(c: []u8, tag: *[tag_length]u8, m: []const u8, ad: []const u8, npub: [nonce_length]u8, key: [key_length]u8) void {
             assert(c.len == m.len);
-            var state = State128L.init(key, npub);
-            var src: [32]u8 align(16) = undefined;
-            var dst: [32]u8 align(16) = undefined;
+            var state = State.init(key, npub);
+            var src: [block_length]u8 align(alignment) = undefined;
+            var dst: [block_length]u8 align(alignment) = undefined;
             var i: usize = 0;
-            while (i + 32 <= ad.len) : (i += 32) {
-                state.absorb(ad[i..][0..32]);
+            while (i + block_length <= ad.len) : (i += block_length) {
+                state.absorb(ad[i..][0..block_length]);
             }
-            if (ad.len % 32 != 0) {
+            if (ad.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. ad.len % 32], ad[i..][0 .. ad.len % 32]);
+                @memcpy(src[0 .. ad.len % block_length], ad[i..][0 .. ad.len % block_length]);
                 state.absorb(&src);
             }
             i = 0;
-            while (i + 32 <= m.len) : (i += 32) {
-                state.enc(c[i..][0..32], m[i..][0..32]);
+            while (i + block_length <= m.len) : (i += block_length) {
+                state.enc(c[i..][0..block_length], m[i..][0..block_length]);
             }
-            if (m.len % 32 != 0) {
+            if (m.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. m.len % 32], m[i..][0 .. m.len % 32]);
+                @memcpy(src[0 .. m.len % block_length], m[i..][0 .. m.len % block_length]);
                 state.enc(&dst, &src);
-                @memcpy(c[i..][0 .. m.len % 32], dst[0 .. m.len % 32]);
+                @memcpy(c[i..][0 .. m.len % block_length], dst[0 .. m.len % block_length]);
             }
             tag.* = state.mac(tag_bits, ad.len, m.len);
         }
@@ -181,31 +266,23 @@ fn Aegis128LGeneric(comptime tag_bits: u9) type {
         /// Contents of `m` are undefined if an error is returned.
         pub fn decrypt(m: []u8, c: []const u8, tag: [tag_length]u8, ad: []const u8, npub: [nonce_length]u8, key: [key_length]u8) AuthenticationError!void {
             assert(c.len == m.len);
-            var state = State128L.init(key, npub);
-            var src: [32]u8 align(16) = undefined;
-            var dst: [32]u8 align(16) = undefined;
+            var state = State.init(key, npub);
+            var src: [block_length]u8 align(alignment) = undefined;
             var i: usize = 0;
-            while (i + 32 <= ad.len) : (i += 32) {
-                state.absorb(ad[i..][0..32]);
+            while (i + block_length <= ad.len) : (i += block_length) {
+                state.absorb(ad[i..][0..block_length]);
             }
-            if (ad.len % 32 != 0) {
+            if (ad.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. ad.len % 32], ad[i..][0 .. ad.len % 32]);
+                @memcpy(src[0 .. ad.len % block_length], ad[i..][0 .. ad.len % block_length]);
                 state.absorb(&src);
             }
             i = 0;
-            while (i + 32 <= m.len) : (i += 32) {
-                state.dec(m[i..][0..32], c[i..][0..32]);
+            while (i + block_length <= m.len) : (i += block_length) {
+                state.dec(m[i..][0..block_length], c[i..][0..block_length]);
             }
-            if (m.len % 32 != 0) {
-                @memset(src[0..], 0);
-                @memcpy(src[0 .. m.len % 32], c[i..][0 .. m.len % 32]);
-                state.dec(&dst, &src);
-                @memcpy(m[i..][0 .. m.len % 32], dst[0 .. m.len % 32]);
-                @memset(dst[0 .. m.len % 32], 0);
-                const blocks = &state.blocks;
-                blocks[0] = blocks[0].xorBlocks(AesBlock.fromBytes(dst[0..16]));
-                blocks[4] = blocks[4].xorBlocks(AesBlock.fromBytes(dst[16..32]));
+            if (m.len % block_length != 0) {
+                state.decLast(m[i..], c[i..]);
             }
             var computed_tag = state.mac(tag_bits, ad.len, m.len);
             const verify = crypto.timing_safe.eql([tag_length]u8, computed_tag, tag);
@@ -218,107 +295,172 @@ fn Aegis128LGeneric(comptime tag_bits: u9) type {
     };
 }
 
-const State256 = struct {
-    blocks: [6]AesBlock,
-
-    fn init(key: [32]u8, nonce: [32]u8) State256 {
-        const c1 = AesBlock.fromBytes(&[16]u8{ 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd });
-        const c2 = AesBlock.fromBytes(&[16]u8{ 0x0, 0x1, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 });
-        const key_block1 = AesBlock.fromBytes(key[0..16]);
-        const key_block2 = AesBlock.fromBytes(key[16..32]);
-        const nonce_block1 = AesBlock.fromBytes(nonce[0..16]);
-        const nonce_block2 = AesBlock.fromBytes(nonce[16..32]);
-        const kxn1 = key_block1.xorBlocks(nonce_block1);
-        const kxn2 = key_block2.xorBlocks(nonce_block2);
-        const blocks = [6]AesBlock{
-            kxn1,
-            kxn2,
-            c1,
-            c2,
-            key_block1.xorBlocks(c2),
-            key_block2.xorBlocks(c1),
-        };
-        var state = State256{ .blocks = blocks };
-        var i: usize = 0;
-        while (i < 4) : (i += 1) {
-            state.update(key_block1);
-            state.update(key_block2);
-            state.update(kxn1);
-            state.update(kxn2);
+fn State256X(comptime degree: u7) type {
+    return struct {
+        const AesBlockVec = crypto.core.aes.BlockVec(degree);
+        const State = @This();
+
+        blocks: [6]AesBlockVec,
+
+        const aes_block_length = AesBlockVec.block_length;
+        const rate = aes_block_length;
+        const alignment = AesBlockVec.native_word_size;
+
+        fn init(key: [32]u8, nonce: [32]u8) State {
+            const c1 = AesBlockVec.fromBytes(&[16]u8{ 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd } ** degree);
+            const c2 = AesBlockVec.fromBytes(&[16]u8{ 0x0, 0x1, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 } ** degree);
+            const key_block1 = AesBlockVec.fromBytes(key[0..16] ** degree);
+            const key_block2 = AesBlockVec.fromBytes(key[16..32] ** degree);
+            const nonce_block1 = AesBlockVec.fromBytes(nonce[0..16] ** degree);
+            const nonce_block2 = AesBlockVec.fromBytes(nonce[16..32] ** degree);
+            const kxn1 = key_block1.xorBlocks(nonce_block1);
+            const kxn2 = key_block2.xorBlocks(nonce_block2);
+            const blocks = [6]AesBlockVec{
+                kxn1,
+                kxn2,
+                c1,
+                c2,
+                key_block1.xorBlocks(c2),
+                key_block2.xorBlocks(c1),
+            };
+            var state = State{ .blocks = blocks };
+            if (degree > 1) {
+                const context_block = ctx: {
+                    var contexts_bytes = [_]u8{0} ** aes_block_length;
+                    for (0..degree) |i| {
+                        contexts_bytes[i * 16] = @intCast(i);
+                        contexts_bytes[i * 16 + 1] = @intCast(degree - 1);
+                    }
+                    break :ctx AesBlockVec.fromBytes(&contexts_bytes);
+                };
+                for (0..4) |_| {
+                    state.blocks[3] = state.blocks[3].xorBlocks(context_block);
+                    state.blocks[5] = state.blocks[5].xorBlocks(context_block);
+                    state.update(key_block1);
+                    state.blocks[3] = state.blocks[3].xorBlocks(context_block);
+                    state.blocks[5] = state.blocks[5].xorBlocks(context_block);
+                    state.update(key_block2);
+                    state.blocks[3] = state.blocks[3].xorBlocks(context_block);
+                    state.blocks[5] = state.blocks[5].xorBlocks(context_block);
+                    state.update(kxn1);
+                    state.blocks[3] = state.blocks[3].xorBlocks(context_block);
+                    state.blocks[5] = state.blocks[5].xorBlocks(context_block);
+                    state.update(kxn2);
+                }
+            } else {
+                for (0..4) |_| {
+                    state.update(key_block1);
+                    state.update(key_block2);
+                    state.update(kxn1);
+                    state.update(kxn2);
+                }
+            }
+            return state;
         }
-        return state;
-    }
 
-    inline fn update(state: *State256, d: AesBlock) void {
-        const blocks = &state.blocks;
-        const tmp = blocks[5].encrypt(blocks[0]);
-        comptime var i: usize = 5;
-        inline while (i > 0) : (i -= 1) {
-            blocks[i] = blocks[i - 1].encrypt(blocks[i]);
+        inline fn update(state: *State, d: AesBlockVec) void {
+            const blocks = &state.blocks;
+            const tmp = blocks[5].encrypt(blocks[0]);
+            comptime var i: usize = 5;
+            inline while (i > 0) : (i -= 1) {
+                blocks[i] = blocks[i - 1].encrypt(blocks[i]);
+            }
+            blocks[0] = tmp.xorBlocks(d);
         }
-        blocks[0] = tmp.xorBlocks(d);
-    }
 
-    fn absorb(state: *State256, src: *const [16]u8) void {
-        const msg = AesBlock.fromBytes(src);
-        state.update(msg);
-    }
+        fn absorb(state: *State, src: *const [rate]u8) void {
+            const msg = AesBlockVec.fromBytes(src);
+            state.update(msg);
+        }
 
-    fn enc(state: *State256, dst: *[16]u8, src: *const [16]u8) void {
-        const blocks = &state.blocks;
-        const msg = AesBlock.fromBytes(src);
-        var tmp = msg.xorBlocks(blocks[5]).xorBlocks(blocks[4]).xorBlocks(blocks[1]);
-        tmp = tmp.xorBlocks(blocks[2].andBlocks(blocks[3]));
-        dst.* = tmp.toBytes();
-        state.update(msg);
-    }
+        fn enc(state: *State, dst: *[rate]u8, src: *const [rate]u8) void {
+            const blocks = &state.blocks;
+            const msg = AesBlockVec.fromBytes(src);
+            var tmp = msg.xorBlocks(blocks[5]).xorBlocks(blocks[4]).xorBlocks(blocks[1]);
+            tmp = tmp.xorBlocks(blocks[2].andBlocks(blocks[3]));
+            dst.* = tmp.toBytes();
+            state.update(msg);
+        }
 
-    fn dec(state: *State256, dst: *[16]u8, src: *const [16]u8) void {
-        const blocks = &state.blocks;
-        var msg = AesBlock.fromBytes(src).xorBlocks(blocks[5]).xorBlocks(blocks[4]).xorBlocks(blocks[1]);
-        msg = msg.xorBlocks(blocks[2].andBlocks(blocks[3]));
-        dst.* = msg.toBytes();
-        state.update(msg);
-    }
+        fn dec(state: *State, dst: *[rate]u8, src: *const [rate]u8) void {
+            const blocks = &state.blocks;
+            var msg = AesBlockVec.fromBytes(src).xorBlocks(blocks[5]).xorBlocks(blocks[4]).xorBlocks(blocks[1]);
+            msg = msg.xorBlocks(blocks[2].andBlocks(blocks[3]));
+            dst.* = msg.toBytes();
+            state.update(msg);
+        }
 
-    fn mac(state: *State256, comptime tag_bits: u9, adlen: usize, mlen: usize) [tag_bits / 8]u8 {
-        const blocks = &state.blocks;
-        var sizes: [16]u8 = undefined;
-        mem.writeInt(u64, sizes[0..8], @as(u64, adlen) * 8, .little);
-        mem.writeInt(u64, sizes[8..16], @as(u64, mlen) * 8, .little);
-        const tmp = AesBlock.fromBytes(&sizes).xorBlocks(blocks[3]);
-        var i: usize = 0;
-        while (i < 7) : (i += 1) {
-            state.update(tmp);
+        fn decLast(state: *State, dst: []u8, src: []const u8) void {
+            const blocks = &state.blocks;
+            const z = blocks[5].xorBlocks(blocks[4]).xorBlocks(blocks[1]).xorBlocks(blocks[2].andBlocks(blocks[3]));
+            var pad = z.toBytes();
+            for (pad[0..src.len], src) |*p, x| p.* ^= x;
+            @memcpy(dst, pad[0..src.len]);
+            @memset(pad[src.len..], 0);
+            const msg = AesBlockVec.fromBytes(pad[0..]);
+            state.update(msg);
         }
-        return switch (tag_bits) {
-            128 => blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3])
-                .xorBlocks(blocks[4]).xorBlocks(blocks[5]).toBytes(),
-            256 => tag: {
-                const t1 = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]);
-                const t2 = blocks[3].xorBlocks(blocks[4]).xorBlocks(blocks[5]);
-                break :tag t1.toBytes() ++ t2.toBytes();
-            },
-            else => unreachable,
-        };
-    }
-};
+
+        fn mac(state: *State, comptime tag_bits: u9, adlen: usize, mlen: usize) [tag_bits / 8]u8 {
+            const blocks = &state.blocks;
+            var sizes: [aes_block_length]u8 = undefined;
+            mem.writeInt(u64, sizes[0..8], @as(u64, adlen) * 8, .little);
+            mem.writeInt(u64, sizes[8..16], @as(u64, mlen) * 8, .little);
+            for (1..degree) |i| {
+                @memcpy(sizes[i * 16 ..][0..16], sizes[0..16]);
+            }
+            const tmp = AesBlockVec.fromBytes(&sizes).xorBlocks(blocks[3]);
+            for (0..7) |_| {
+                state.update(tmp);
+            }
+            switch (tag_bits) {
+                128 => {
+                    var tag_multi = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).xorBlocks(blocks[3]).xorBlocks(blocks[4]).xorBlocks(blocks[5]).toBytes();
+                    var tag = tag_multi[0..16].*;
+                    @memcpy(tag[0..], tag_multi[0..16]);
+                    for (1..degree) |d| {
+                        for (0..16) |i| {
+                            tag[i] ^= tag_multi[d * 16 + i];
+                        }
+                    }
+                    return tag;
+                },
+                256 => {
+                    const tag_multi_1 = blocks[0].xorBlocks(blocks[1]).xorBlocks(blocks[2]).toBytes();
+                    const tag_multi_2 = blocks[3].xorBlocks(blocks[4]).xorBlocks(blocks[5]).toBytes();
+                    var tag = tag_multi_1[0..16].* ++ tag_multi_2[0..16].*;
+                    for (1..degree) |d| {
+                        for (0..16) |i| {
+                            tag[i] ^= tag_multi_1[d * 16 + i];
+                            tag[i + 16] ^= tag_multi_2[d * 16 + i];
+                        }
+                    }
+                    return tag;
+                },
+                else => unreachable,
+            }
+        }
+    };
+}
 
 /// AEGIS is a very fast authenticated encryption system built on top of the core AES function.
 ///
-/// The 256 bit variant of AEGIS has a 256 bit key, a 256 bit nonce, and processes 128 bit message blocks.
+/// The 256 bit variants of AEGIS have a 256 bit key and a 256 bit nonce.
 ///
 /// https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/
-fn Aegis256Generic(comptime tag_bits: u9) type {
+fn Aegis256XGeneric(comptime degree: u7, comptime tag_bits: u9) type {
+    comptime assert(degree > 0); // degree must be greater than 0
     comptime assert(tag_bits == 128 or tag_bits == 256); // tag must be 128 or 256 bits
 
     return struct {
+        const State = State256X(degree);
+
         pub const tag_length = tag_bits / 8;
         pub const nonce_length = 32;
         pub const key_length = 32;
-        pub const block_length = 16;
+        pub const block_length = State.rate;
 
-        const State = State256;
+        const alignment = State.alignment;
 
         /// c: ciphertext: output buffer should be of size m.len
         /// tag: authentication tag: output MAC
@@ -328,27 +470,27 @@ fn Aegis256Generic(comptime tag_bits: u9) type {
         /// k: private key
         pub fn encrypt(c: []u8, tag: *[tag_length]u8, m: []const u8, ad: []const u8, npub: [nonce_length]u8, key: [key_length]u8) void {
             assert(c.len == m.len);
-            var state = State256.init(key, npub);
-            var src: [16]u8 align(16) = undefined;
-            var dst: [16]u8 align(16) = undefined;
+            var state = State.init(key, npub);
+            var src: [block_length]u8 align(alignment) = undefined;
+            var dst: [block_length]u8 align(alignment) = undefined;
             var i: usize = 0;
-            while (i + 16 <= ad.len) : (i += 16) {
-                state.enc(&dst, ad[i..][0..16]);
+            while (i + block_length <= ad.len) : (i += block_length) {
+                state.enc(&dst, ad[i..][0..block_length]);
             }
-            if (ad.len % 16 != 0) {
+            if (ad.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. ad.len % 16], ad[i..][0 .. ad.len % 16]);
+                @memcpy(src[0 .. ad.len % block_length], ad[i..][0 .. ad.len % block_length]);
                 state.enc(&dst, &src);
             }
             i = 0;
-            while (i + 16 <= m.len) : (i += 16) {
-                state.enc(c[i..][0..16], m[i..][0..16]);
+            while (i + block_length <= m.len) : (i += block_length) {
+                state.enc(c[i..][0..block_length], m[i..][0..block_length]);
             }
-            if (m.len % 16 != 0) {
+            if (m.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. m.len % 16], m[i..][0 .. m.len % 16]);
+                @memcpy(src[0 .. m.len % block_length], m[i..][0 .. m.len % block_length]);
                 state.enc(&dst, &src);
-                @memcpy(c[i..][0 .. m.len % 16], dst[0 .. m.len % 16]);
+                @memcpy(c[i..][0 .. m.len % block_length], dst[0 .. m.len % block_length]);
             }
             tag.* = state.mac(tag_bits, ad.len, m.len);
         }
@@ -364,30 +506,23 @@ fn Aegis256Generic(comptime tag_bits: u9) type {
         /// Contents of `m` are undefined if an error is returned.
         pub fn decrypt(m: []u8, c: []const u8, tag: [tag_length]u8, ad: []const u8, npub: [nonce_length]u8, key: [key_length]u8) AuthenticationError!void {
             assert(c.len == m.len);
-            var state = State256.init(key, npub);
-            var src: [16]u8 align(16) = undefined;
-            var dst: [16]u8 align(16) = undefined;
+            var state = State.init(key, npub);
+            var src: [block_length]u8 align(alignment) = undefined;
             var i: usize = 0;
-            while (i + 16 <= ad.len) : (i += 16) {
-                state.enc(&dst, ad[i..][0..16]);
+            while (i + block_length <= ad.len) : (i += block_length) {
+                state.absorb(ad[i..][0..block_length]);
             }
-            if (ad.len % 16 != 0) {
+            if (ad.len % block_length != 0) {
                 @memset(src[0..], 0);
-                @memcpy(src[0 .. ad.len % 16], ad[i..][0 .. ad.len % 16]);
-                state.enc(&dst, &src);
+                @memcpy(src[0 .. ad.len % block_length], ad[i..][0 .. ad.len % block_length]);
+                state.absorb(&src);
             }
             i = 0;
-            while (i + 16 <= m.len) : (i += 16) {
-                state.dec(m[i..][0..16], c[i..][0..16]);
+            while (i + block_length <= m.len) : (i += block_length) {
+                state.dec(m[i..][0..block_length], c[i..][0..block_length]);
             }
-            if (m.len % 16 != 0) {
-                @memset(src[0..], 0);
-                @memcpy(src[0 .. m.len % 16], c[i..][0 .. m.len % 16]);
-                state.dec(&dst, &src);
-                @memcpy(m[i..][0 .. m.len % 16], dst[0 .. m.len % 16]);
-                @memset(dst[0 .. m.len % 16], 0);
-                const blocks = &state.blocks;
-                blocks[0] = blocks[0].xorBlocks(AesBlock.fromBytes(&dst));
+            if (m.len % block_length != 0) {
+                state.decLast(m[i..], c[i..]);
             }
             var computed_tag = state.mac(tag_bits, ad.len, m.len);
             const verify = crypto.timing_safe.eql([tag_length]u8, computed_tag, tag);
@@ -400,6 +535,24 @@ fn Aegis256Generic(comptime tag_bits: u9) type {
     };
 }
 
+/// The `Aegis128X4Mac` message authentication function outputs 256 bit tags.
+/// In addition to being extremely fast, its large state, non-linearity
+/// and non-invertibility provide the following properties:
+/// - 128 bit security, stronger than GHash/Polyval/Poly1305.
+/// - Recovering the secret key from the state would require ~2^128 attempts,
+///   which is infeasible for any practical adversary.
+/// - It has a large security margin against internal collisions.
+pub const Aegis128X4Mac = AegisMac(Aegis128X4_256);
+
+/// The `Aegis128X2Mac` message authentication function outputs 256 bit tags.
+/// In addition to being extremely fast, its large state, non-linearity
+/// and non-invertibility provide the following properties:
+/// - 128 bit security, stronger than GHash/Polyval/Poly1305.
+/// - Recovering the secret key from the state would require ~2^128 attempts,
+///   which is infeasible for any practical adversary.
+/// - It has a large security margin against internal collisions.
+pub const Aegis128X2Mac = AegisMac(Aegis128X2_256);
+
 /// The `Aegis128LMac` message authentication function outputs 256 bit tags.
 /// In addition to being extremely fast, its large state, non-linearity
 /// and non-invertibility provides the following properties:
@@ -409,34 +562,60 @@ fn Aegis256Generic(comptime tag_bits: u9) type {
 /// - It has a large security margin against internal collisions.
 pub const Aegis128LMac = AegisMac(Aegis128L_256);
 
+/// The `Aegis256X4Mac` message authentication function has a 256-bit key size,
+/// and outputs 256 bit tags. Unless theoretical multi-target attacks are a
+/// concern, the AEGIS-128L variant should be preferred.
+/// AEGIS' large state, non-linearity and non-invertibility provide the
+/// following properties:
+/// - 256 bit security against forgery.
+/// - Recovering the secret key from the state would require ~2^256 attempts,
+///   which is infeasible for any practical adversary.
+/// - It has a large security margin against internal collisions.
+pub const Aegis256X4Mac = AegisMac(Aegis256X4_256);
+
+/// The `Aegis256X2Mac` message authentication function has a 256-bit key size,
+/// and outputs 256 bit tags. Unless theoretical multi-target attacks are a
+/// concern, the AEGIS-128L variant should be preferred.
+/// AEGIS' large state, non-linearity and non-invertibility provide the
+/// following properties:
+/// - 256 bit security against forgery.
+/// - Recovering the secret key from the state would require ~2^256 attempts,
+///   which is infeasible for any practical adversary.
+/// - It has a large security margin against internal collisions.
+pub const Aegis256X2Mac = AegisMac(Aegis256X2_256);
+
 /// The `Aegis256Mac` message authentication function has a 256-bit key size,
 /// and outputs 256 bit tags. Unless theoretical multi-target attacks are a
 /// concern, the AEGIS-128L variant should be preferred.
 /// AEGIS' large state, non-linearity and non-invertibility provides the
 /// following properties:
-/// - More than 128 bit security against forgery.
+/// - 256 bit security against forgery.
 /// - Recovering the secret key from the state would require ~2^256 attempts,
 ///   which is infeasible for any practical adversary.
 /// - It has a large security margin against internal collisions.
 pub const Aegis256Mac = AegisMac(Aegis256_256);
 
-/// Aegis128L MAC with a 128-bit output.
-/// A MAC with a 128-bit output is not safe unless the number of messages
-/// authenticated with the same key remains small.
-/// After 2^48 messages, the probability of a collision is already ~ 2^-33.
-/// If unsure, use the  Aegis128LMac type, that has a 256 bit output.
+/// AEGIS-128X4 MAC with 128-bit tags
+pub const Aegis128X4Mac_128 = AegisMac(Aegis128X4);
+
+/// AEGIS-128X2 MAC with 128-bit tags
+pub const Aegis128X2Mac_128 = AegisMac(Aegis128X2);
+
+/// AEGIS-128L MAC with 128-bit tags
 pub const Aegis128LMac_128 = AegisMac(Aegis128L);
 
-/// Aegis256 MAC with a 128-bit output.
-/// A MAC with a 128-bit output is not safe unless the number of messages
-/// authenticated with the same key remains small.
-/// After 2^48 messages, the probability of a collision is already ~ 2^-33.
-/// If unsure, use the  Aegis256Mac type, that has a 256 bit output.
+/// AEGIS-256X4 MAC with 128-bit tags
+pub const Aegis256X4Mac_128 = AegisMac(Aegis256X4);
+
+/// AEGIS-256X2 MAC with 128-bit tags
+pub const Aegis256X2Mac_128 = AegisMac(Aegis256X2);
+
+/// AEGIS-256 MAC with 128-bit tags
 pub const Aegis256Mac_128 = AegisMac(Aegis256);
 
 fn AegisMac(comptime T: type) type {
     return struct {
-        const Self = @This();
+        const Mac = @This();
 
         pub const mac_length = T.tag_length;
         pub const key_length = T.key_length;
@@ -448,15 +627,15 @@ fn AegisMac(comptime T: type) type {
         msg_len: usize = 0,
 
         /// Initialize a state for the MAC function
-        pub fn init(key: *const [key_length]u8) Self {
+        pub fn init(key: *const [key_length]u8) Mac {
             const nonce = [_]u8{0} ** T.nonce_length;
-            return Self{
+            return Mac{
                 .state = T.State.init(key.*, nonce),
             };
         }
 
         /// Add data to the state
-        pub fn update(self: *Self, b: []const u8) void {
+        pub fn update(self: *Mac, b: []const u8) void {
             self.msg_len += b.len;
 
             const len_partial = @min(b.len, block_length - self.off);
@@ -469,6 +648,10 @@ fn AegisMac(comptime T: type) type {
 
             var i = len_partial;
             self.off = 0;
+            while (i + block_length * 2 <= b.len) : (i += block_length * 2) {
+                self.state.absorb(b[i..][0..block_length]);
+                self.state.absorb(b[i..][block_length .. block_length * 2]);
+            }
             while (i + block_length <= b.len) : (i += block_length) {
                 self.state.absorb(b[i..][0..block_length]);
             }
@@ -479,7 +662,7 @@ fn AegisMac(comptime T: type) type {
         }
 
         /// Return an authentication tag for the current state
-        pub fn final(self: *Self, out: *[mac_length]u8) void {
+        pub fn final(self: *Mac, out: *[mac_length]u8) void {
             if (self.off > 0) {
                 var pad = [_]u8{0} ** block_length;
                 @memcpy(pad[0..self.off], self.buf[0..self.off]);
@@ -490,20 +673,20 @@ fn AegisMac(comptime T: type) type {
 
         /// Return an authentication tag for a message and a key
         pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [key_length]u8) void {
-            var ctx = Self.init(key);
+            var ctx = Mac.init(key);
             ctx.update(msg);
             ctx.final(out);
         }
 
         pub const Error = error{};
-        pub const Writer = std.io.Writer(*Self, Error, write);
+        pub const Writer = std.io.Writer(*Mac, Error, write);
 
-        fn write(self: *Self, bytes: []const u8) Error!usize {
+        fn write(self: *Mac, bytes: []const u8) Error!usize {
             self.update(bytes);
             return bytes.len;
         }
 
-        pub fn writer(self: *Self) Writer {
+        pub fn writer(self: *Mac) Writer {
             return .{ .context = self };
         }
     };
@@ -568,6 +751,23 @@ test "Aegis128L test vector 3" {
     try htest.assertEqual("83cc600dc4e3e7e62d4055826174f149", &tag);
 }
 
+test "Aegis128X2 test vector 1" {
+    const key: [Aegis128X2.key_length]u8 = [_]u8{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
+    const nonce: [Aegis128X2.nonce_length]u8 = [_]u8{ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f };
+    var empty = [_]u8{}; // zero-length buffer reused as message, ciphertext and associated data
+    var tag: [Aegis128X2.tag_length]u8 = undefined;
+    var tag256: [Aegis128X2_256.tag_length]u8 = undefined;
+
+    Aegis128X2.encrypt(&empty, &tag, &empty, &empty, nonce, key); // nothing to encrypt: only `tag` is written
+    Aegis128X2_256.encrypt(&empty, &tag256, &empty, &empty, nonce, key); // same inputs, longer tag
+    try htest.assertEqual("63117dc57756e402819a82e13eca8379", &tag);
+    try htest.assertEqual("b92c71fdbd358b8a4de70b27631ace90cffd9b9cfba82028412bac41b4f53759", &tag256);
+    tag[0] +%= 1; // corrupt the first tag byte (wrapping add) so verification must fail
+    try testing.expectError(error.AuthenticationFailed, Aegis128X2.decrypt(&empty, &empty, tag, &empty, nonce, key));
+    tag256[0] +%= 1; // same corruption check for the longer tag
+    try testing.expectError(error.AuthenticationFailed, Aegis128X2_256.decrypt(&empty, &empty, tag256, &empty, nonce, key));
+}
+
 test "Aegis256 test vector 1" {
     const key: [Aegis256.key_length]u8 = [_]u8{ 0x10, 0x01 } ++ [_]u8{0x00} ** 30;
     const nonce: [Aegis256.nonce_length]u8 = [_]u8{ 0x10, 0x00, 0x02 } ++ [_]u8{0x00} ** 29;
@@ -624,6 +824,23 @@ test "Aegis256 test vector 3" {
     try htest.assertEqual("f7a0878f68bd083e8065354071fc27c3", &tag);
 }
 
+test "Aegis256X4 test vector 1" {
+    const key: [Aegis256X4.key_length]u8 = [_]u8{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f };
+    const nonce: [Aegis256X4.nonce_length]u8 = [_]u8{ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f };
+    var empty = [_]u8{}; // zero-length buffer reused as message, ciphertext and associated data
+    var tag: [Aegis256X4.tag_length]u8 = undefined;
+    var tag256: [Aegis256X4_256.tag_length]u8 = undefined;
+
+    Aegis256X4.encrypt(&empty, &tag, &empty, &empty, nonce, key); // nothing to encrypt: only `tag` is written
+    Aegis256X4_256.encrypt(&empty, &tag256, &empty, &empty, nonce, key); // same inputs, longer tag
+    try htest.assertEqual("3b7fee6cee7bf17888ad11ed2397beb4", &tag);
+    try htest.assertEqual("6093a1a8aab20ec635dc1ca71745b01b5bec4fc444c9ffbebd710d4a34d20eaf", &tag256);
+    tag[0] +%= 1; // corrupt the first tag byte (wrapping add) so verification must fail
+    try testing.expectError(error.AuthenticationFailed, Aegis256X4.decrypt(&empty, &empty, tag, &empty, nonce, key));
+    tag256[0] +%= 1; // same corruption check for the longer tag
+    try testing.expectError(error.AuthenticationFailed, Aegis256X4_256.decrypt(&empty, &empty, tag256, &empty, nonce, key));
+}
+
 test "Aegis MAC" {
     const key = [_]u8{0x00} ** Aegis128LMac.key_length;
     var msg: [64]u8 = undefined;
lib/std/crypto/aes.zig
@@ -22,6 +22,7 @@ pub const has_hardware_support =
     (builtin.cpu.arch == .aarch64 and has_armaes);
 
 pub const Block = impl.Block;
+pub const BlockVec = impl.BlockVec;
 pub const AesEncryptCtx = impl.AesEncryptCtx;
 pub const AesDecryptCtx = impl.AesDecryptCtx;
 pub const Aes128 = impl.Aes128;
lib/std/crypto/benchmark.zig
@@ -72,6 +72,10 @@ const macs = [_]Crypto{
     Crypto{ .ty = crypto.auth.siphash.SipHash64(1, 3), .name = "siphash-1-3" },
     Crypto{ .ty = crypto.auth.siphash.SipHash128(2, 4), .name = "siphash128-2-4" },
     Crypto{ .ty = crypto.auth.siphash.SipHash128(1, 3), .name = "siphash128-1-3" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis128X4Mac, .name = "aegis-128x4 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis256X4Mac, .name = "aegis-256x4 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis128X2Mac, .name = "aegis-128x2 mac" },
+    Crypto{ .ty = crypto.auth.aegis.Aegis256X2Mac, .name = "aegis-256x2 mac" },
     Crypto{ .ty = crypto.auth.aegis.Aegis128LMac, .name = "aegis-128l mac" },
     Crypto{ .ty = crypto.auth.aegis.Aegis256Mac, .name = "aegis-256 mac" },
     Crypto{ .ty = crypto.auth.cmac.CmacAes128, .name = "aes-cmac" },
@@ -283,7 +287,11 @@ const aeads = [_]Crypto{
     Crypto{ .ty = crypto.aead.chacha_poly.XChaCha20Poly1305, .name = "xchacha20Poly1305" },
     Crypto{ .ty = crypto.aead.chacha_poly.XChaCha8Poly1305, .name = "xchacha8Poly1305" },
     Crypto{ .ty = crypto.aead.salsa_poly.XSalsa20Poly1305, .name = "xsalsa20Poly1305" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis128X4, .name = "aegis-128x4" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis128X2, .name = "aegis-128x2" },
     Crypto{ .ty = crypto.aead.aegis.Aegis128L, .name = "aegis-128l" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis256X4, .name = "aegis-256x4" },
+    Crypto{ .ty = crypto.aead.aegis.Aegis256X2, .name = "aegis-256x2" },
     Crypto{ .ty = crypto.aead.aegis.Aegis256, .name = "aegis-256" },
     Crypto{ .ty = crypto.aead.aes_gcm.Aes128Gcm, .name = "aes128-gcm" },
     Crypto{ .ty = crypto.aead.aes_gcm.Aes256Gcm, .name = "aes256-gcm" },
lib/std/crypto.zig
@@ -7,10 +7,23 @@ pub const timing_safe = @import("crypto/timing_safe.zig");
 /// Authenticated Encryption with Associated Data
 pub const aead = struct {
     pub const aegis = struct {
-        pub const Aegis128L = @import("crypto/aegis.zig").Aegis128L;
-        pub const Aegis128L_256 = @import("crypto/aegis.zig").Aegis128L_256;
-        pub const Aegis256 = @import("crypto/aegis.zig").Aegis256;
-        pub const Aegis256_256 = @import("crypto/aegis.zig").Aegis256_256;
+        const variants = @import("crypto/aegis.zig");
+
+        pub const Aegis128X4 = variants.Aegis128X4;
+        pub const Aegis128X2 = variants.Aegis128X2;
+        pub const Aegis128L = variants.Aegis128L;
+
+        pub const Aegis256X4 = variants.Aegis256X4;
+        pub const Aegis256X2 = variants.Aegis256X2;
+        pub const Aegis256 = variants.Aegis256;
+
+        pub const Aegis128X4_256 = variants.Aegis128X4_256;
+        pub const Aegis128X2_256 = variants.Aegis128X2_256;
+        pub const Aegis128L_256 = variants.Aegis128L_256;
+
+        pub const Aegis256X4_256 = variants.Aegis256X4_256;
+        pub const Aegis256X2_256 = variants.Aegis256X2_256;
+        pub const Aegis256_256 = variants.Aegis256_256;
     };
 
     pub const aes_gcm = struct {
@@ -44,10 +57,22 @@ pub const auth = struct {
     pub const hmac = @import("crypto/hmac.zig");
     pub const siphash = @import("crypto/siphash.zig");
     pub const aegis = struct {
-        pub const Aegis128LMac = @import("crypto/aegis.zig").Aegis128LMac;
-        pub const Aegis128LMac_128 = @import("crypto/aegis.zig").Aegis128LMac_128;
-        pub const Aegis256Mac = @import("crypto/aegis.zig").Aegis256Mac;
-        pub const Aegis256Mac_128 = @import("crypto/aegis.zig").Aegis256Mac_128;
+        const variants = @import("crypto/aegis.zig");
+        pub const Aegis128X4Mac = variants.Aegis128X4Mac;
+        pub const Aegis128X2Mac = variants.Aegis128X2Mac;
+        pub const Aegis128LMac = variants.Aegis128LMac;
+
+        pub const Aegis256X4Mac = variants.Aegis256X4Mac;
+        pub const Aegis256X2Mac = variants.Aegis256X2Mac;
+        pub const Aegis256Mac = variants.Aegis256Mac;
+
+        pub const Aegis128X4Mac_128 = variants.Aegis128X4Mac_128;
+        pub const Aegis128X2Mac_128 = variants.Aegis128X2Mac_128;
+        pub const Aegis128LMac_128 = variants.Aegis128LMac_128;
+
+        pub const Aegis256X4Mac_128 = variants.Aegis256X4Mac_128;
+        pub const Aegis256X2Mac_128 = variants.Aegis256X2Mac_128;
+        pub const Aegis256Mac_128 = variants.Aegis256Mac_128;
     };
     pub const cmac = @import("crypto/cmac.zig");
 };