From: Loup Vaillant Date: Tue, 13 Jul 2021 22:19:56 +0000 (+0200) Subject: Simplified crypto_x25519_dirty_fast() X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=11bb1f6a8e6c70bd39ff55fb2701d6ba99ce062c;p=Monocypher.git Simplified crypto_x25519_dirty_fast() I noticed a pattern that I didn't exploit optimally. The current code is a bit simpler. Also, I couldn't reverse engineer my own code, so I've expanded the comments to explain the trick in more detail. --- diff --git a/src/monocypher.c b/src/monocypher.c index aac38f2..5af1ae7 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -2588,6 +2588,49 @@ void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) WIPE_BUFFER(scalar); } +// Select low order point +// We're computing the [cofactor]lop scalar multiplication, where: +// +// cofactor = tweak & 7. +// lop = (lop_x, lop_y) +// lop_x = sqrt((sqrt(d + 1) + 1) / d) +// lop_y = -lop_x * sqrtm1 +// +// The low order point has order 8. There are 4 such points. We've +// chosen the one whose both coordinates are positive (below p/2). +// The 8 low order points are as follows: +// +// [0]lop = ( 0 , 1 ) +// [1]lop = ( lop_x , lop_y) +// [2]lop = ( sqrt(-1), -0 ) +// [3]lop = ( lop_x , -lop_y) +// [4]lop = (-0 , -1 ) +// [5]lop = (-lop_x , -lop_y) +// [6]lop = (-sqrt(-1), 0 ) +// [7]lop = (-lop_x , lop_y) +// +// The x coordinate is either 0, sqrt(-1), lop_x, or their opposite. +// The y coordinate is either 0, -1 , lop_y, or their opposite. +// The pattern for both is the same, except for a rotation of 2 (modulo 8) +// +// This helper function captures the pattern, and we can use it thus: +// +// select_lop(x, lop_x, sqrtm1, cofactor); +// select_lop(y, lop_y, fe_one, cofactor + 2); +// +// This is faster than an actual scalar multiplication, +// and requires less code than naive constant time look up. +static void select_lop(fe out, const fe x, const fe k, u8 cofactor) +{ + fe tmp; + fe_0(out); + fe_ccopy(out, k , (cofactor >> 1) & 1); // bit 1 + fe_ccopy(out, x , (cofactor >> 0) & 1); // bit 0 + fe_neg (tmp, out); + fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2 + WIPE_BUFFER(tmp); +} + // "Fast" dirty ephemeral key // We use this one by default. // @@ -2597,38 +2640,17 @@ void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32]) // The cost is a bigger binary for programs that don't also sign messages. void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32]) { + // Compute clean scalar multiplication u8 scalar[32]; ge pk; COPY(scalar, secret_key, 32); trim_scalar(scalar); ge_scalarmult_base(&pk, scalar); - // Select low order point - // We're computing the [cofactor]lop scalar multiplication, where: - // cofactor = tweak & 7. - // lop = (lop_x, lop_y) - // lop_x = sqrt((sqrt(d + 1) + 1) / d) - // lop_y = -lop_x * sqrtm1 - // Notes: - // - A (single) Montgomery ladder would be twice as slow. - // - An actual scalar multiplication would hurt performance. - // - A full table lookup would take more code. - u8 cofactor = secret_key[0] & 7; - int a = (cofactor >> 2) & 1; - int b = (cofactor >> 1) & 1; - int c = (cofactor >> 0) & 1; - fe t1, t2, t3; - fe_0(t1); - fe_ccopy(t1, sqrtm1, b); - fe_ccopy(t1, lop_x , c); - fe_neg (t3, t1); - fe_ccopy(t1, t3, a); - fe_1(t2); - fe_0(t3); - fe_ccopy(t2, t3 , b); - fe_ccopy(t2, lop_y, c); - fe_neg (t3, t2); - fe_ccopy(t2, t3, a^b); + // Compute low order point + fe t1, t2; + select_lop(t1, lop_x, sqrtm1, secret_key[0]); + select_lop(t2, lop_y, fe_one, secret_key[0] + 2); ge_precomp low_order_point; fe_add(low_order_point.Yp, t2, t1); fe_sub(low_order_point.Ym, t2, t1); @@ -2646,9 +2668,9 @@ void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32]) fe_tobytes(public_key, t1); - WIPE_BUFFER(t1); WIPE_BUFFER(scalar); - WIPE_BUFFER(t2); WIPE_CTX(&pk); - WIPE_BUFFER(t3); WIPE_CTX(&low_order_point); + WIPE_BUFFER(t1); WIPE_CTX(&pk); + WIPE_BUFFER(t2); WIPE_CTX(&low_order_point); + WIPE_BUFFER(scalar); } ///////////////////