From: Loup Vaillant Date: Mon, 3 Jul 2023 21:27:37 +0000 (+0200) Subject: Faster Argon2 inner loop X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=014f45d9fe54271c8e08006eea83fa97b9d55d32;p=Monocypher.git Faster Argon2 inner loop Compilers aren't magical. They need help to generate the best code. Here we want to compute the following expression: mask = 0xffffffff; 2 * (a & mask) * (b & mask) The most efficient way to do this looks like this: u64 al = (u32)a; // Truncate u64 bl = (u32)b; // Truncate u64 x = al * bl; // 32->64 bit multiply u64 x2 = x << 1; // shift return x2; My compiler doesn't pick up on this, and performs a slower alternative instead. Either the multiply by two uses an actual multiply instead of a shift, or the shift is done first, forcing a more expensive 64->64 multiply. More naive compilers may even do both. Whatever the cause, I got 5% faster code on GCC 11.3. --- diff --git a/src/monocypher.c b/src/monocypher.c index 03a53f6..11de568 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -710,12 +710,12 @@ static void extended_hash(u8 *digest, u32 digest_size, } } -#define LSB(x) ((x) & 0xffffffff) +#define LSB(x) ((u64)(u32)x) #define G(a, b, c, d) \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 32); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 24); \ + a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 16); \ + c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 63) #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ v8, v9, v10, v11, v12, v13, v14, v15) \ G(v0, v4, v8, v12); G(v1, v5, v9, v13); \