From: Loup Vaillant Date: Sun, 19 Feb 2017 22:27:21 +0000 (+0100) Subject: ref10 curve25519. Moar Speed. X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=974fe8dd6607a691d2f1d76ccb62241b748ddfbe;p=Monocypher.git ref10 curve25519. Moar Speed. --- diff --git a/monocypher.c b/monocypher.c index 8bf0d74..858c259 100644 --- a/monocypher.c +++ b/monocypher.c @@ -25,8 +25,10 @@ #define FOR(i, start, end) for (size_t i = start; i < end; i++) #define sv static void +typedef int8_t i8; typedef uint8_t u8; typedef uint32_t u32; +typedef int32_t i32; typedef int64_t i64; typedef uint64_t u64; @@ -339,7 +341,7 @@ sv incr(u64 x[2], u64 y) if (x[0] < y) { x[1]++; } // handle overflow } -sv blake2b_compress(crypto_blake2b_ctx *ctx, _Bool last_block) +sv blake2b_compress(crypto_blake2b_ctx *ctx, int last_block) { static const u8 sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, @@ -652,10 +654,10 @@ static u32 gidx_next(gidx_ctx *ctx) // Pass 1+: 3 last segments plus already constructed // blocks in this segment. THE SPEC SUGGESTS OTHERWISE. // I CONFORM TO THE REFERENCE IMPLEMENTATION. - _Bool first_pass = ctx->pass_number == 0; - u32 slice_size = ctx->nb_blocks / 4; - u32 area_size = ((first_pass ? ctx->slice_number : 3) - * slice_size + index - 1); + int first_pass = ctx->pass_number == 0; + u32 slice_size = ctx->nb_blocks / 4; + u32 area_size = ((first_pass ? ctx->slice_number : 3) + * slice_size + index - 1); // Computes the starting position of the reference area. // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE @@ -684,7 +686,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size, u32 nb_iterations) { // work area seen as blocks (must be suitably aligned) - block *blocks = work_area; + block *blocks = (block*)work_area; { crypto_blake2b_ctx ctx; crypto_blake2b_init(&ctx); @@ -728,7 +730,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size, // fill (then re-fill) the rest of the blocks FOR (pass_number, 0, nb_iterations) { - _Bool first_pass = pass_number == 0; + int first_pass = pass_number == 0; // Simple copy on pass 0, XOR instead of overwrite on subsequent passes void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block; @@ -759,274 +761,461 @@ void crypto_argon2i(u8 *tag, u32 tag_size, extended_hash(tag, tag_size, final_block, 1024); } +//////////////////////////////////// +/// Arithmetic modulo 2^255 - 19 /// Taken from Supercop's ref10 implementation. +//////////////////////////////////// A bit bigger than TweetNaCl, much faster. -/////////////// -/// X-25519 /// (Taken from TweetNaCl) -/////////////// -typedef i64 gf[16]; -static const u8 _0[16]; -static const u8 _9[32] = { 9 }; -static const gf _121665 = { 0xdb41, 1 }; - -sv car_25519(gf o) -{ - FOR(i, 0, 16) { - o[i] += 1LL << 16; - i64 c = o[i] >> 16; - o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15)); - o[i] -= c << 16; - } -} +// field element +typedef i32 fe[10]; + +sv fe_0 (fe h) { FOR (i, 0, 10) h[i] = 0; } +sv fe_1 (fe h) { h[0] = 1; FOR (i, 1, 10) h[i] = 0; } +sv fe_neg (fe h, const fe f) { FOR (i, 0, 10) h[i] = -f[i]; } +sv fe_add (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] + g[i]; } +sv fe_sub (fe h, const fe f, const fe g) { FOR (i, 0, 10) h[i] = f[i] - g[i]; } +sv fe_copy(fe h, const fe f ) { FOR (i, 0, 10) h[i] = f[i]; } -sv sel_25519(gf p, gf q, int b) +sv fe_cswap(fe f, fe g, u32 b) { - i64 c = ~(b-1); - FOR(i, 0, 16) { - i64 t = c & (p[i] ^ q[i]); - p[i] ^= t; - q[i] ^= t; + FOR (i, 0, 10) { + i32 x = (f[i] ^ g[i]) & -b; + f[i] = f[i] ^ x; + g[i] = g[i] ^ x; } } -sv pack_25519(u8 *o, const gf n) +static u32 load24_le(const u8 s[3]) { - gf t; - FOR(i, 0, 16) t[i] = n[i]; - car_25519(t); - car_25519(t); - car_25519(t); - FOR(j, 0, 2) { - gf m; - m[0] = t[0] - 0xffed; - FOR(i, 1, 15) { - m[i ] = t[i] - 0xffff - ((m[i-1] >> 16) & 1); - m[i-1] &= 0xffff; - } - m[15] = t[15] - 0x7fff - ((m[14] >> 16) & 1); - int b = (m[15] >> 16) & 1; - m[14] &= 0xffff; - sel_25519(t, m, 1-b); + return (u32)s[0] + | ((u32)s[1] << 8) + | ((u32)s[2] << 16); +} + +sv fe_carry(fe h, i64 t[10]) +{ + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; + c9 = (t[9] + (i64) (1<<24)) >> 25; t[0] += c9 * 19; t[9] -= c9 << 25; + c1 = (t[1] + (i64) (1<<24)) >> 25; t[2] += c1; t[1] -= c1 << 25; + c3 = (t[3] + (i64) (1<<24)) >> 25; t[4] += c3; t[3] -= c3 << 25; + c5 = (t[5] + (i64) (1<<24)) >> 25; t[6] += c5; t[5] -= c5 << 25; + c7 = (t[7] + (i64) (1<<24)) >> 25; t[8] += c7; t[7] -= c7 << 25; + c0 = (t[0] + (i64) (1<<25)) >> 26; t[1] += c0; t[0] -= c0 << 26; + c2 = (t[2] + (i64) (1<<25)) >> 26; t[3] += c2; t[2] -= c2 << 26; + c4 = (t[4] + (i64) (1<<25)) >> 26; t[5] += c4; t[4] -= c4 << 26; + c6 = (t[6] + (i64) (1<<25)) >> 26; t[7] += c6; t[6] -= c6 << 26; + c8 = (t[8] + (i64) (1<<25)) >> 26; t[9] += c8; t[8] -= c8 << 26; + FOR (i, 0, 10) { h[i] = t[i]; } +} + +sv fe_frombytes(fe h, const u8 s[32]) +{ + i64 t[10]; // intermediate result (may overflow 32 bits) + t[0] = load32_le(s); + t[1] = load24_le(s + 4) << 6; + t[2] = load24_le(s + 7) << 5; + t[3] = load24_le(s + 10) << 3; + t[4] = load24_le(s + 13) << 2; + t[5] = load32_le(s + 16); + t[6] = load24_le(s + 20) << 7; + t[7] = load24_le(s + 23) << 5; + t[8] = load24_le(s + 26) << 4; + t[9] = (load24_le(s + 29) & 8388607) << 2; + fe_carry(h, t); +} + +sv fe_mul121666(fe h, const fe f) +{ + i64 t[10]; + FOR(i, 0, 10) { t[i] = f[i] * (i64) 121666; } + fe_carry(h, t); +} + +sv fe_mul(fe h, const fe f, const fe g) +{ + // Everything is unrolled and put in temporary variables. + // We could roll the loop, but that would make it twice as slow. + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4]; + i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9]; + i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2; + i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19; + i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19; + i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19; + + i64 h0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6 + + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1; + i64 h1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7 + + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2; + i64 h2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8 + + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3; + i64 h3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9 + + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4; + i64 h4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0 + + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5; + i64 h5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1 + + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6; + i64 h6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2 + + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7; + i64 h7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3 + + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8; + i64 h8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4 + + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9; + i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5 + + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0; + +#define CARRY_MULT \ + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \ + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; \ + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; \ + c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 << 25; \ + c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 << 25; \ + c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 << 26; \ + c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 << 26; \ + c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 << 25; \ + c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 << 25; \ + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; \ + c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 << 26; \ + c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; \ + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; \ + \ + h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; \ + h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9 + CARRY_MULT; +} + +sv fe_sq(fe h, const fe f) +{ + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; + i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; + i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; + i32 f8_19 = f8*19; i32 f9_38 = f9*38; + + i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 + + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; + i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 + + f4 *(i64)f7_38 + f5_2*(i64)f6_19; + i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 + + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; + i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 + + f5_2*(i64)f8_19 + f6 *(i64)f7_38; + i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 + + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; + i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 + + f6 *(i64)f9_38 + f7_2*(i64)f8_19; + i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 + + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; + i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 + + f3_2*(i64)f4 + f8 *(i64)f9_38; + i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 + + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; + i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + + f3_2*(i64)f6 + f4 *(i64)f5_2; + CARRY_MULT; +} + +sv fe_invert(fe out, const fe z) +{ + /* + fe c; fe_copy(c, z); + FOR (i, 0, 254) { + fe_sq(c, c); + if(i !=251 && i!= 249) fe_mul(c, c, z); + } + fe_copy(out, c); + */ + fe t0, t1, t2, t3; + fe_sq(t0, z ); + fe_sq(t1, t0); + fe_sq(t1, t1); + fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t2, t0); fe_mul(t1 , t1, t2); + fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); + +} + +void fe_pow22523(fe out, const fe z) +{ + /* + fe c; fe_copy(c, z); + FOR(i, 0, 251) { + fe_sq (c, c); + if (i != 249) fe_mul(c, c, z); + } + fe_copy(out, c); + */ + fe t0, t1, t2; + fe_sq(t0, z); + fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); + +} + +sv fe_tobytes(u8 s[32], const fe h) +{ + i32 t[11]; + FOR (i, 0, 10) { t[i] = h[i]; } + + i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25; + FOR (i, 0, 5) { + q += t[2*i ]; q >>= 26; + q += t[2*i+1]; q >>= 25; } - FOR(i, 0, 16) { - o[2*i ] = t[i] & 0xff; - o[2*i + 1] = t[i] >> 8; + t[0] += 19 * q; + FOR (i, 0, 5) { + i32 carry; + carry = t[2*i ] >> 26; t[2*i+1] += carry; t[2*i ] -= carry << 26; + carry = t[2*i+1] >> 25; t[2*i+2] += carry; t[2*i+1] -= carry << 25; } + store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26)); + store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19)); + store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13)); + store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6)); + store32_le(s + 16, ((u32)t[5] << 0) | ((u32)t[6] << 25)); + store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19)); + store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12)); + store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6)); } -sv unpack_25519(gf o, const u8 *n) +// Parity check. Returns 0 if even, 1 if odd +static int fe_isnegative(const fe f) { - FOR(i, 0, 16) o[i] = n[2*i] + ((i64)n[2*i + 1] << 8); - o[15] &= 0x7fff; + u8 s[32]; + fe_tobytes(s, f); + return s[0] & 1; } -sv A(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] + b[i]; } -sv Z(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] - b[i]; } -sv M(gf o, const gf a, const gf b) +static int fe_isnonzero(const fe f) { - i64 t[31]; - FOR(i, 0, 31) t[i] = 0; - FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j]; - FOR(i, 0, 15) t[i] += 38 * t[i+16]; - FOR(i, 0, 16) o[i] = t[i]; - car_25519(o); - car_25519(o); + static const u8 zero[32]; + u8 s[32]; + fe_tobytes(s, f); + return crypto_memcmp(s, zero, 32); } -sv S(gf o,const gf a) { M(o, a, a); } - -sv inv_25519(gf o,const gf i) -{ - gf c; - FOR(a, 0, 16) c[a] = i[a]; - for(int a = 253; a >= 0; a--) { - S(c, c); - if(a != 2 && a != 4) - M(c, c, i); - } - FOR(a, 0, 16) o[a] = c[a]; -} - -void crypto_x25519(u8 q[32], const u8 n[32], const u8 p[32]) -{ - u8 z[32]; - i64 x[80]; - i64 r; - gf a, b, c, d, e, f; - FOR(i, 0, 31) z[i] = n[i]; - z[31] = (n[31] & 127) | 64; - z[0 ] &= 248; - unpack_25519(x, p); - FOR(i, 0, 16) { - b[i] = x[i]; - d[i] = a[i] = c[i] = 0; - } - a[0] = d[0] = 1; - for(int i = 254; i>=0; i--) { - r = (z[i>>3] >> (i & 7)) & 1; - sel_25519(a, b, r); - sel_25519(c, d, r); - A(e, a, c); - Z(a, a, c); - A(c, b, d); - Z(b, b, d); - S(d, e); - S(f, a); - M(a, c, a); - M(c, b, e); - A(e, a, c); - Z(a, a, c); - S(b, a); - Z(c, d, f); - M(a, c, _121665); - A(a, a, d); - M(c, c, a); - M(a, d, f); - M(d, b, x); - S(b, e); - sel_25519(a, b, r); - sel_25519(c, d, r); - } - FOR(i, 0, 16) { - x[i+16] = a[i]; - x[i+32] = c[i]; - x[i+48] = b[i]; - x[i+64] = d[i]; +/////////////// +/// X-25519 /// Taken from Supercop's ref10 implementation. +/////////////// Bigger than TweetNaCl, but over 8 times faster +sv trim_scalar(u8 s[32]) +{ + s[ 0] &= 248; + s[31] &= 127; + s[31] |= 64; +} + +void crypto_x25519(u8 shared_secret [32], + const u8 your_secret_key [32], + const u8 their_public_key[32]) +{ + // computes the scalar product + fe x1, x2, z2, x3, z3; + fe_frombytes(x1, their_public_key); + + // restrict the possible scalar values + u8 e[32]; FOR (i, 0, 32) { e[i] = your_secret_key[i]; } + trim_scalar(e); + + // Montgomery ladder + // We work in projective coordinates to avoid divisons: x = X / Z + // We don't care about the y coordinate. + fe_1(x2); fe_0(z2); // "zero" point + fe_copy(x3, x1); fe_1(z3); // "one" point + u32 swap = 0; + for (int pos = 254; pos >= 0; --pos) { + // constant time conditional swap before ladder step + u32 b = (e[pos / 8] >> (pos & 7)) & 1; + swap ^= b; // xor trick avoids swapping at the end of the loop + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + swap = b; // anticipates one last swap after the loop + + // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) + // with differential addition + fe t0, t1; + fe_sub(t0, x3, z3); fe_sub(t1, x2, z2); fe_add(x2, x2, z2); + fe_add(z2, x3, z3); fe_mul(z3, t0, x2); fe_mul(z2, z2, t1); + fe_sq (t0, t1 ); fe_sq (t1, x2 ); fe_add(x3, z3, z2); + fe_sub(z2, z3, z2); fe_mul(x2, t1, t0); fe_sub(t1, t1, t0); + fe_sq (z2, z2 ); fe_mul121666(z3, t1); fe_sq (x3, x3 ); + fe_add(t0, t0, z3); fe_mul(z3, x1, z2); fe_mul(z2, t1, t0); } - inv_25519(x+32, x+32); - M(x+16, x+16, x+32); - pack_25519(q, x+16); + // last swap is necessary to compensate for the xor trick + fe_cswap(x2, x3, swap); + fe_cswap(z2, z3, swap); + + // normalises the coordinates: x == X / Z + fe_invert(z2, z2); + fe_mul(x2, x2, z2); + fe_tobytes(shared_secret, x2); } -void crypto_x25519_public_key(u8 q[32], const u8 n[32]) +void crypto_x25519_public_key(u8 public_key[32], + const u8 secret_key[32]) { - crypto_x25519(q, n, _9); + static const u8 base_point[32] = {9}; + crypto_x25519(public_key, secret_key, base_point); } /////////////// -/// Ed25519 /// (Taken from TweetNaCl) +/// Ed25519 /// /////////////// -static const gf gf0; -static const gf gf1 = { 1 }; -static const gf D = { 0x78a3, 0x1359, 0x4dca, 0x75eb, - 0xd8ab, 0x4141, 0x0a4d, 0x0070, - 0xe898, 0x7779, 0x4079, 0x8cc7, - 0xfe73, 0x2b6f, 0x6cee, 0x5203 }; -static const gf D2 = { 0xf159, 0x26b2, 0x9b94, 0xebd6, - 0xb156, 0x8283, 0x149a, 0x00e0, - 0xd130, 0xeef3, 0x80f2, 0x198e, - 0xfce7, 0x56df, 0xd9dc, 0x2406 }; -static const gf X = { 0xd51a, 0x8f25, 0x2d60, 0xc956, - 0xa7b2, 0x9525, 0xc760, 0x692c, - 0xdc5c, 0xfdd6, 0xe231, 0xc0a4, - 0x53fe, 0xcd6e, 0x36d3, 0x2169 }; -static const gf Y = { 0x6658, 0x6666, 0x6666, 0x6666, - 0x6666, 0x6666, 0x6666, 0x6666, - 0x6666, 0x6666, 0x6666, 0x6666, - 0x6666, 0x6666, 0x6666, 0x6666 }; -static const gf I = { 0xa0b0, 0x4a0e, 0x1b27, 0xc4ee, - 0xe478, 0xad2f, 0x1806, 0x2f43, - 0xd7a7, 0x3dfb, 0x0099, 0x2b4d, - 0xdf0b, 0x4fc1, 0x2480, 0x2b83 }; -static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, - 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; - -sv set_25519(gf r, const gf a) { FOR(i, 0, 16) r[i] = a[i]; } - -static u8 par_25519(const gf a) -{ - u8 d[32]; - pack_25519(d, a); - return d[0] & 1; -} - -sv pow2523(gf o,const gf i) -{ - gf c; - FOR(a, 0, 16) c[a] = i[a]; - for(int a = 250; a >= 0; a--) { - S(c, c); - if(a != 1) M(c, c, i); - } - FOR(a, 0, 16) o[a] = c[a]; -} -static int neq_25519(const gf a, const gf b) -{ - u8 c[32],d[32]; - pack_25519(c, a); - pack_25519(d, b); - return crypto_memcmp(c, d, 32); -} +// Point in a twisted Edwards curve, +// in extended projective coordinates +// x = X/Z, y = Y/Z, T = XY/Z +typedef struct { fe X; fe Y; fe Z; fe T; } ge; -sv add(gf p[4], gf q[4]) +sv ge_from_xy(ge *p, const fe x, const fe y) { - gf a, b, c, d, t, e, f, g, h; - Z(a, p[1], p[0]); - Z(t, q[1], q[0]); - M(a, a, t); - A(b, p[0], p[1]); - A(t, q[0], q[1]); - M(b, b, t); - M(c, p[3], q[3]); - M(c, c, D2); - M(d, p[2], q[2]); - A(d, d, d); - Z(e, b, a); - Z(f, d, c); - A(g, d, c); - A(h, b, a); + FOR (i, 0, 10) { + p->X[i] = x[i]; + p->Y[i] = y[i]; + } + fe_1 (p->Z); + fe_mul(p->T, x, y); +} + +sv ge_cswap(ge *p, ge *q, u32 b) +{ + fe_cswap(p->X, q->X, b); + fe_cswap(p->Y, q->Y, b); + fe_cswap(p->Z, q->Z, b); + fe_cswap(p->T, q->T, b); +} + +sv ge_tobytes(u8 s[32], const ge *h) +{ + fe recip, x, y; + fe_invert(recip, h->Z); + fe_mul(x, h->X, recip); + fe_mul(y, h->Y, recip); + fe_tobytes(s, y); + s[31] ^= fe_isnegative(x) << 7; +} + +// Variable time! s must not be secret! +static int ge_frombytes_neg(ge *h, const u8 s[32]) +{ + static const fe d = { + -10913610,13857413,-15372611,6949391,114729, + -8787816,-6275908,-3247719,-18696448,-12055116 + } ; + static const fe sqrtm1 = { + -32595792,-7943725,9377950,3500415,12389472, + -272473,-25146209,-2005654,326686,11406482 + } ; + fe u, v, v3, vxx, check; + fe_frombytes(h->Y, s); + fe_1(h->Z); + fe_sq(u, h->Y); // y^2 + fe_mul(v, u, d); + fe_sub(u, u, h->Z); // u = y^2-1 + fe_add(v, v, h->Z); // v = dy^2+1 + + fe_sq(v3, v); + fe_mul(v3, v3, v); // v3 = v^3 + fe_sq(h->X, v3); + fe_mul(h->X, h->X, v); + fe_mul(h->X, h->X, u); // x = uv^7 + + fe_pow22523(h->X, h->X); // x = (uv^7)^((q-5)/8) + fe_mul(h->X, h->X, v3); + fe_mul(h->X, h->X, u); // x = uv^3(uv^7)^((q-5)/8) + + fe_sq(vxx, h->X); + fe_mul(vxx, vxx, v); + fe_sub(check, vxx, u); // vx^2-u + if (fe_isnonzero(check)) { + fe_add(check, vxx, u); // vx^2+u + if (fe_isnonzero(check)) return -1; + fe_mul(h->X, h->X, sqrtm1); + } - M(p[0], e, f); - M(p[1], h, g); - M(p[2], g, f); - M(p[3], e, h); -} + if (fe_isnegative(h->X) == (s[31] >> 7)) + fe_neg(h->X, h->X); -sv cswap(gf p[4], gf q[4], u8 b) -{ - FOR(i, 0, 4) - sel_25519(p[i],q[i],b); + fe_mul(h->T, h->X, h->Y); + return 0; } -sv pack(u8 *r, gf p[4]) +sv ge_add(ge *s, const ge *p, const ge *q) { - gf tx, ty, zi; - inv_25519(zi, p[2]); - M(tx, p[0], zi); - M(ty, p[1], zi); - pack_25519(r, ty); - r[31] ^= par_25519(tx) << 7; -} + static const fe D2 = { // - 2 * 121665 / 121666 + 0x2b2f159, 0x1a6e509, 0x22add7a, 0x0d4141d, 0x0038052, + 0x0f3d130, 0x3407977, 0x19ce331, 0x1c56dff, 0x0901b67 + }; + fe a, b, c, d, e, f, g, h; + // A = (Y1-X1) * (Y2-X2) + // B = (Y1+X1) * (Y2+X2) + fe_sub(a, p->Y, p->X); fe_sub(h, q->Y, q->X); fe_mul(a, a, h); + fe_add(b, p->X, p->Y); fe_add(h, q->X, q->Y); fe_mul(b, b, h); + fe_mul(c, p->T, q->T); fe_mul(c, c, D2 ); // C = T1 * k * T2 + fe_add(d, p->Z, p->Z); fe_mul(d, d, q->Z); // D = Z1 * 2 * Z2 + fe_sub(e, b, a); // E = B - A + fe_sub(f, d, c); // F = D - C + fe_add(g, d, c); // G = D + C + fe_add(h, b, a); // H = B + A + fe_mul(s->X, e, f); // X3 = E * F + fe_mul(s->Y, g, h); // Y3 = G * H + fe_mul(s->Z, f, g); // T3 = E * H !error in the explicit formula database! + fe_mul(s->T, e, h); // Z3 = F * G +} + +sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) +{ + ge t; + fe_0(p->X); fe_copy(t.X, q->X); + fe_1(p->Y); fe_copy(t.Y, q->Y); + fe_1(p->Z); fe_copy(t.Z, q->Z); + fe_0(p->T); fe_copy(t.T, q->T); -sv scalarmult(gf p[4], gf q[4], const u8 *s) -{ - set_25519(p[0], gf0); - set_25519(p[1], gf1); - set_25519(p[2], gf1); - set_25519(p[3], gf0); for (int i = 255; i >= 0; i--) { - u8 b = (s[i/8] >> (i & 7)) & 1; - cswap(p, q, b); - add(q, p); - add(p, p); - cswap(p, q, b); + u8 b = (scalar[i/8] >> (i & 7)) & 1; + ge_cswap(p, &t, b); + ge_add(&t, &t, p); + ge_add(p , p , p); + ge_cswap(p, &t, b); } } -sv scalarbase(gf p[4], const u8 *s) +sv ge_scalarmult_base(ge *p, const u8 scalar[32]) { - gf q[4]; - set_25519(q[0], X); - set_25519(q[1], Y); - set_25519(q[2], gf1); - M(q[3], X, Y); - scalarmult(p, q, s); + static const fe X = { + 0x325d51a, 0x18b5823, 0x0f6592a, 0x104a92d, 0x1a4b31d, + 0x1d6dc5c, 0x27118fe, 0x07fd814, 0x13cd6e5, 0x085a4db}; + static const fe Y = { + 0x2666658, 0x1999999, 0x0cccccc, 0x1333333, 0x1999999, + 0x0666666, 0x3333333, 0x0cccccc, 0x2666666, 0x1999999}; + ge base_point; + ge_from_xy(&base_point, X, Y); + ge_scalarmult(p, &base_point, scalar); } sv modL(u8 *r, i64 x[64]) { + static const u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, + 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; i64 i, j; for (i = 63;i >= 32;--i) { i64 carry = 0; @@ -1059,43 +1248,8 @@ sv reduce(u8 r[64]) modL(r, x); } -static int unpackneg(gf r[4],const u8 p[32]) -{ - gf t, chk, num, den, den2, den4, den6; - set_25519(r[2], gf1); - unpack_25519(r[1], p); - S(num,r [1]); - M(den, num, D); - Z(num, num, r[2]); - A(den, r[2], den); - - S(den2, den); - S(den4, den2); - M(den6, den4, den2); - M(t, den6, num); - M(t, t, den); - - pow2523(t, t); - M(t, t, num); - M(t, t, den); - M(t, t, den); - M(r[0], t, den); - - S(chk, r[0]); - M(chk, chk, den); - if (neq_25519(chk, num)) M(r[0], r[0], I); - - S(chk, r[0]); - M(chk, chk, den); - if (neq_25519(chk, num)) return -1; - - if (par_25519(r[0]) == (p[31]>>7)) Z(r[0],gf0,r[0]); - - M(r[3], r[0], r[1]); - return 0; -} - -sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) +// hashes R || A || M, reduces it modulo L +sv hash_ram(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) { HASH_CTX ctx; HASH_INIT (&ctx); @@ -1106,39 +1260,32 @@ sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) reduce(k); } -void crypto_ed25519_public_key(u8 public_key[32], const u8 secret_key[32]) +void crypto_ed25519_public_key(u8 public_key[32], + const u8 secret_key[32]) { - // hash the private key, turn the hash into a scalar u8 a[64]; HASH(a, secret_key, 32); - a[ 0] &= 248; - a[31] &= 127; - a[31] |= 64; - - // the public key is the packed form of the point aB (B == basepoint) - gf aB[4]; - scalarbase(aB, a); - pack(public_key, aB); + trim_scalar(a); + ge A; + ge_scalarmult_base(&A, a); + ge_tobytes(public_key, &A); } -void crypto_ed25519_sign(u8 signature[64], - const u8 secret_key[32], - const u8 *message, - size_t message_size) +void crypto_ed25519_sign(uint8_t signature[64], + const uint8_t secret_key[32], + const uint8_t *message, + size_t message_size) { u8 h[64]; u8 *a = h; // secret scalar u8 *prefix = h + 32; // prefix for nonce generation HASH(h, secret_key, 32); + trim_scalar(a); - // build public key from secret key - a[ 0] &= 248; - a[31] &= 127; - a[31] |= 64; - gf aB[4]; - scalarbase(aB, a); + ge A; u8 public_key[32]; - pack(public_key, aB); + ge_scalarmult_base(&A, a); + ge_tobytes(public_key, &A); // Constructs the "random" nonce from the secret key and message. // An actual random number would work just fine, and would save us @@ -1151,36 +1298,39 @@ void crypto_ed25519_sign(u8 signature[64], HASH_UPDATE(&ctx, message, message_size); HASH_FINAL (&ctx, r); - gf rB[4]; + ge R; reduce(r); - scalarbase(rB, r); - pack(signature, rB); // first half of the signature = "random" nonce + ge_scalarmult_base(&R, r); + ge_tobytes(signature, &R); // first half of the signature = "random" nonce - u8 k[64]; - hash_k(k, signature, public_key, message, message_size); + u8 h_ram[64]; + hash_ram(h_ram, signature, public_key, message, message_size); - i64 s[64]; // s = r + k a + i64 s[64]; // s = r + h_ram a FOR(i, 0, 32) s[i] = (u64) r[i]; FOR(i, 32, 64) s[i] = 0; FOR(i, 0, 32) { FOR(j, 0, 32) { - s[i+j] += k[i] * (u64) a[j]; + s[i+j] += h_ram[i] * (u64) a[j]; } } modL(signature + 32, s); // second half of the signature = s } -int crypto_ed25519_check(const u8 signature[64], - const u8 public_key[32], - const u8 *message, +int crypto_ed25519_check(const uint8_t signature[64], + const uint8_t public_key[32], + const uint8_t *message, size_t message_size) { - gf aB[4]; if (unpackneg(aB, public_key)) return -1; // -aB - u8 k[64]; hash_k(k, signature, public_key, message, message_size); - gf p[4]; scalarmult(p, aB, k); // p = -aB k - gf sB[4]; scalarbase(sB, signature + 32); add(p, sB); // p = s - aB k - u8 t[32]; pack(t, p); - return crypto_memcmp(signature, t, 32); // R == s - aB k ? OK : fail + ge A, p, sB, diff; + u8 h_ram[64], R_check[32]; + if (ge_frombytes_neg(&A, public_key)) return -1; // -A + hash_ram(h_ram, signature, public_key, message, message_size); + ge_scalarmult(&p, &A, h_ram); // p = -A*h_ram + ge_scalarmult_base(&sB, signature + 32); + ge_add(&diff, &p, &sB); // diff = s - A*h_ram + ge_tobytes(R_check, &diff); + return crypto_memcmp(signature, R_check, 32); // R == s - A*h_ram ? OK : fail } //////////////////////////////// @@ -1251,7 +1401,7 @@ void crypto_lock_key(u8 shared_key[32], const u8 your_secret_key [32], const u8 their_public_key[32]) { - static const u8 _0[16]; + static const u8 _0[16] = {0}; u8 shared_secret[32]; crypto_x25519(shared_secret, your_secret_key, their_public_key); crypto_chacha20_H(shared_key, shared_secret, _0); @@ -1311,7 +1461,7 @@ int crypto_unlock(u8 *plaintext, box, box + 16, text_size); } -static const u8 null_nonce[24]; +static const u8 null_nonce[24] = {0}; void crypto_anonymous_lock(u8 *box, const u8 random_secret_key[32],