From fb0f4df38cb7ee029f8feacd2971173458d9ffcd Mon Sep 17 00:00:00 2001 From: Loup Vaillant Date: Sun, 29 Nov 2020 21:27:25 +0100 Subject: [PATCH] Faster reduction modulo L Replaced TweetNaCl's code by Barrett reduction. - mod_l(), reduce(), and mul_add() use less stack. - mod_l(), reduce(), and mul_add() are now much faster. - Signature generation is noticeably faster (~7% on my Skylake laptop). - Now I understand all the code. No more black boxes. The change cost a total of 8 lines of code. --- AUTHORS.md | 3 +- CHANGELOG.md | 1 + src/monocypher.c | 203 +++++++++++++++++++++++++++-------------------- 3 files changed, 117 insertions(+), 90 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 22d30fa..007c4c6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -19,8 +19,7 @@ Implementors - **Argon2i:** Loup Vaillant, implemented from spec. - **X25519:** Daniel J. Bernstein, taken and packaged from SUPERCOP ref10. -- **EdDSA:** Loup Vaillant, with bits and pieces from SUPERCOP ref10 - and TweetNaCl. +- **EdDSA:** Loup Vaillant, with bits and pieces from SUPERCOP ref10. Test suite ---------- diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e84f51..39f7cba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Quality assurance for 2^255-19 arithmetic (elliptic curves): - Documented carry propagation. - Enforced slightly safer invariants. +- Improved the speed of EdDSA signature generation. - Made the vectors.h header more compact and easier to modify. - TIS-CI integration. - Added speed benchmark for ed25519-donna. diff --git a/src/monocypher.c b/src/monocypher.c index f628171..936a01e 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -1667,73 +1667,118 @@ void crypto_x25519_public_key(u8 public_key[32], /////////////////////////// /// Arithmetic modulo L /// /////////////////////////// -static const u8 L[32] = { - 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2, - 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, }; - -// r = x mod L (little-endian) -static void modL(u8 *r, i64 x[64]) -{ - for (unsigned i = 63; i >= 32; i--) { - i64 carry = 0; - FOR (j, i-32, i-12) { - x[j] += carry - 16 * x[i] * L[j - (i - 32)]; - carry = (x[j] + 128) >> 8; - x[j] -= carry * (1 << 8); +static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de, + 0x00000000, 0x00000000, 0x00000000, 0x10000000,}; + +// p = a*b + p +static void multiply(u32 p[16], const u32 a[8], const u32 b[8]) +{ + FOR (i, 0, 8) { + u64 carry = 0; + FOR (j, 0, 8) { + carry += p[i+j] + (u64)a[i] * b[j]; + p[i+j] = (u32)carry; + carry >>= 32; } - x[i-12] += carry; - x[i] = 0; + p[i+8] = (u32)carry; } - i64 carry = 0; - FOR (i, 0, 32) { - x[i] += carry - (x[31] >> 4) * L[i]; - carry = x[i] >> 8; - x[i] &= 255; +} + +static int is_above_l(const u32 x[8]) +{ + // We work with L directly, in a 2's complement encoding + // (-L == ~L + 1) + u64 carry = 1; + FOR (i, 0, 8) { + carry += (u64)x[i] + ~L[i]; + carry >>= 32; } - FOR (i, 0, 32) { - x[i] -= carry * L[i]; + return carry; +} + +// Final reduction modulo L, by conditionally removing L. +// if x < l , then r = x +// if l <= x 2*l, then r = x-l +// otherwise the result will be wrong +static void remove_l(u32 r[8], const u32 x[8]) +{ + u64 carry = is_above_l(x); + u32 mask = ~(u32)carry + 1; // carry == 0 or 1 + FOR (i, 0, 8) { + carry += (u64)x[i] + (~L[i] & mask); + r[i] = (u32)carry; + carry >>= 32; + } +} + +// Full reduction modulo L (Barrett reduction) +// Uses +static void mod_l(u8 reduced[32], const u32 x[16]) +{ + static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d, + 0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,}; + // xr = x * r + u32 xr[25] = {0}; + FOR (i, 0, 9) { + u64 carry = 0; + FOR (j, 0, 16) { + carry += xr[i+j] + (u64)r[i] * x[j]; + xr[i+j] = (u32)carry; + carry >>= 32; + } + xr[i+16] = (u32)carry; + } + // xr = floor(xr / 2^512) * L + // Since the result is guaranteed to be below 2*L, + // it is enough to only compute the first 256 bits. + // The division is performed by saying xr[i+16]. (16 * 32 = 512) + ZERO(xr, 8); + FOR (i, 0, 8) { + u64 carry = 0; + FOR (j, 0, 8-i) { + carry += xr[i+j] + (u64)xr[i+16] * L[j]; + xr[i+j] = (u32)carry; + carry >>= 32; + } } - FOR (i, 0, 32) { - x[i+1] += x[i] >> 8; - r[i ] = x[i] & 255; + // xr = x - xr + u64 carry = 1; + carry = 1; + FOR (i, 0, 8) { + carry += (u64)x[i] + ~xr[i]; + xr[i] = (u32)carry; + carry >>= 32; } + // Final reduction modulo L (conditional subtraction) + remove_l(xr, xr); + store32_le_buf(reduced, xr, 8); + + WIPE_BUFFER(xr); } -// Reduces a 64-byte hash modulo L (little endian) static void reduce(u8 r[64]) { - i64 x[64]; - COPY(x, r, 64); - modL(r, x); + u32 x[16]; + load32_le_buf(x, r, 16); + mod_l(r, x); WIPE_BUFFER(x); } // r = (a * b) + c static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32]) { - i64 s[64]; - FOR (i, 0, 32) { - s[i] = (i64)(u64)c[i]; // preserve unsigned - } - ZERO(s + 32, 32); - FOR (i, 0, 32) { - FOR (j, 0, 32) { - s[i+j] += a[i] * (u64)b[j]; - } - } - modL(r, s); - WIPE_BUFFER(s); -} + u32 A[8]; load32_le_buf(A, a, 8); + u32 B[8]; load32_le_buf(B, b, 8); + u32 p[16]; + load32_le_buf(p, c, 8); + ZERO(p + 8, 8); -// Variable time! a must not be secret! -static int is_above_L(const u8 a[32]) -{ - for (int i = 31; i >= 0; i--) { - if (a[i] > L[i]) { return 1; } - if (a[i] < L[i]) { return 0; } - } - return 1; + multiply(p, A, B); + + mod_l(r, p); + WIPE_BUFFER(p); + WIPE_BUFFER(A); + WIPE_BUFFER(B); } /////////////// @@ -2075,9 +2120,11 @@ static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32]) // => Use only to *check* signatures. static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32]) { - ge A; // not secret, not wiped + ge A; // not secret, not wiped + u32 s32[8]; // not secret, not wiped + load32_le_buf(s32, s, 8); if (ge_frombytes_vartime(&A, pk) || // A = pk - is_above_L(s)) { // prevent s malleability + is_above_l(s32)) { // prevent s malleability return -1; } fe_neg(A.X, A.X); @@ -2475,12 +2522,12 @@ void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32]) // s + L * (x%8) < 2^256 static void add_xl(u8 s[32], u8 x) { - u32 mod8 = x & 7; - u32 carry = 0; - FOR (i , 0, 32) { - carry = carry + s[i] + L[i] * mod8; - s[i] = (u8)carry; - carry >>= 8; + u64 mod8 = x & 7; + u64 carry = 0; + FOR (i , 0, 8) { + carry = carry + load32_le(s + 4*i) + L[i] * mod8; + store32_le(s + 4*i, (u32)carry); + carry >>= 32; } } @@ -2795,19 +2842,6 @@ void crypto_key_exchange(u8 shared_key[32], /////////////////////// /// Scalar division /// /////////////////////// -static void multiply(u32 p[16], const u32 a[8], const u32 b[8]) -{ - ZERO(p, 16); - FOR (i, 0, 8) { - u64 carry = 0; - FOR (j, 0, 8) { - carry += p[i+j] + (u64)a[i] * b[j]; - p[i+j] = (u32)carry; - carry >>= 32; - } - p[i+8] = (u32)carry; - } -} // Montgomery reduction. // Divides x by (2^256), and reduces the result modulo L @@ -2838,7 +2872,7 @@ static void redc(u32 u[8], u32 x[16]) carry >>= 32; } } - u32 t[16]; + u32 t[16] = {0}; multiply(t, s, l); // t = t + x @@ -2854,17 +2888,8 @@ static void redc(u32 u[8], u32 x[16]) // So a constant time conditional subtraction is enough // We work with L directly, in a 2's complement encoding // (-L == ~L + 1) - carry = 1; - FOR (i, 0, 8) { - carry += (u64)t[i+8] + ~l[i]; - carry >>= 32; - } - u32 mask = ~(u32)carry + 1; // carry == 0 or 1 - FOR (i, 0, 8) { - carry += (u64)t[i+8] + (~l[i] & mask); - u[i] = (u32)carry; - carry >>= 32; - } + remove_l(u, t+8); + WIPE_BUFFER(s); WIPE_BUFFER(t); } @@ -2888,19 +2913,21 @@ void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32], // m_scl = scalar * 2^256 (modulo L) u32 m_scl[8]; { - i64 tmp[64]; - ZERO(tmp, 32); - COPY(tmp+32, scalar, 32); - modL(scalar, tmp); + u32 tmp[16]; + ZERO(tmp, 8); + load32_le_buf(tmp+8, scalar, 8); + mod_l(scalar, tmp); load32_le_buf(m_scl, scalar, 8); WIPE_BUFFER(tmp); // Wipe ASAP to save stack space } u32 product[16]; for (int i = 252; i >= 0; i--) { + ZERO(product, 16); multiply(product, m_inv, m_inv); redc(m_inv, product); if (scalar_bit(Lm2, i)) { + ZERO(product, 16); multiply(product, m_inv, m_scl); redc(m_inv, product); } -- 2.47.3