From: Loup Vaillant Date: Mon, 3 Jul 2023 21:27:37 +0000 (+0200) Subject: Faster Argon2 inner loop X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=014f45d9fe54271c8e08006eea83fa97b9d55d32;p=Monocypher.git Faster Argon2 inner loop Compilers aren't magical. They need help to generate the best code. Here we want to compute the following expression: mask = 0xffffffff; 2 * (a & mask) * (b & mask) The most efficient way to do this looks like this: u64 al = (u32)a; // Truncate u64 bl = (u32)b; // Truncate u64 x = al * bl; // 32->64 bit multiply u64 x2 = x << 1; // shift return x2; My compiler doesn't pick up on this, and performs a slower alternative instead. Either the multiply by two uses an actual multiply instead of a shift, or the shift is done first, forcing a more expensive 64->64 multiply. More naive compilers may even do both. Whatever the cause, I got 5% faster code on GCC 11.3. --- diff --git a/src/monocypher.c b/src/monocypher.c index 03a53f6..11de568 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -710,12 +710,12 @@ static void extended_hash(u8 *digest, u32 digest_size, } } -#define LSB(x) ((x) & 0xffffffff) +#define LSB(x) ((u64)(u32)x) #define G(a, b, c, d) \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 32); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 24); \ + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 16); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 63) #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ v8, v9, v10, v11, v12, v13, v14, v15) \ G(v0, v4, v8, v12); G(v1, v5, v9, v13); \