From: Loup Vaillant Date: Thu, 6 Jul 2017 19:55:06 +0000 (+0200) Subject: Sacrificed conciseness for performance X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=b7d250469026e357fa2d0c23f9e2982c70658334;p=Monocypher.git Sacrificed conciseness for performance --- diff --git a/src/monocypher.c b/src/monocypher.c index 670b9d4..a4703c1 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -97,17 +97,26 @@ int crypto_zerocmp(const u8 *p, size_t n) sv chacha20_rounds(u32 out[16], const u32 in[16]) { - FOR (i, 0, 16) { out[i] = in[i]; } + // The temporary variables make Chacha20 10% faster. + u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3]; + u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7]; + u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11]; + u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15]; + FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. - QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0 - QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1 - QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2 - QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3 - QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 0 - QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 1 - QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 2 - QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 3 + QUARTERROUND(t0, t4, t8 , t12); // column 0 + QUARTERROUND(t1, t5, t9 , t13); // column 1 + QUARTERROUND(t2, t6, t10, t14); // column 2 + QUARTERROUND(t3, t7, t11, t15); // column 3 + QUARTERROUND(t0, t5, t10, t15); // diagonal 0 + QUARTERROUND(t1, t6, t11, t12); // diagonal 1 + QUARTERROUND(t2, t7, t8 , t13); // diagonal 2 + QUARTERROUND(t3, t4, t9 , t14); // diagonal 3 + } + out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3; + out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7; + out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11; + 
out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; } sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32]) @@ -764,7 +773,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size, /// Arithmetic modulo 2^255 - 19 /// //////////////////////////////////// // Taken from Supercop's ref10 implementation. -// A bit bigger than TweetNaCl, over 6 times faster. +// A bit bigger than TweetNaCl, about 8 times faster. // field element typedef i32 fe[10]; @@ -883,24 +892,92 @@ sv fe_mul(fe h, const fe f, const fe g) h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; } -sv fe_sq(fe h, const fe f) { fe_mul(h, f, f); } - -// power to a power of 2 minus a small number. -// Timing depends on pow_2 and minus, so they shall not be secret. -sv fe_power(fe out, const fe z, int pow_2, u8 minus) +// we could use fe_mul() for this, but this is significantly faster +sv fe_sq(fe h, const fe f) { - minus--; - fe c; fe_copy(c, z); - for (int i = pow_2 - 2; i >= 0; i--) { - fe_sq(c, c); - if (i >= 8 || !((minus >> i) & 1)) { - fe_mul(c, c, z); - } - } - fe_copy(out, c); + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; + i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; + i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; + i32 f8_19 = f8*19; i32 f9_38 = f9*38; + + i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 + + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; + i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 + + f4 *(i64)f7_38 + f5_2*(i64)f6_19; + i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 + + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; + i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 + + f5_2*(i64)f8_19 + f6 *(i64)f7_38; + i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 + + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; + i64 
h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 + + f6 *(i64)f9_38 + f7_2*(i64)f8_19; + i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 + + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; + i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 + + f3_2*(i64)f4 + f8 *(i64)f9_38; + i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 + + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; + i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + + f3_2*(i64)f6 + f4 *(i64)f5_2; + + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; + c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 << 25; + c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 << 25; + c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 << 26; + c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 << 26; + c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 << 25; + c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 << 25; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; + c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 << 26; + c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; + + h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; + h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; +} + +// This could be simplified, but it would be slower +sv fe_invert(fe out, const fe z) +{ + fe t0, t1, t2, t3; + fe_sq(t0, z ); + fe_sq(t1, t0); + fe_sq(t1, t1); + fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t2, t0); fe_mul(t1 , t1, t2); + fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 100) 
fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); +} + +// This could be simplified, but it would be slower +void fe_pow22523(fe out, const fe z) +{ + fe t0, t1, t2; + fe_sq(t0, z); + fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); } -sv fe_invert (fe out, const fe z) { fe_power(out, z, 255, 21); } -sv fe_pow22523(fe out, const fe z) { fe_power(out, z, 252, 3); } sv fe_tobytes(u8 s[32], const fe h) { @@ -1125,7 +1202,23 @@ sv ge_add(ge *s, const ge *p, const ge *q) fe_mul(s->T, e, h); // T3 = E * H } -sv ge_double(ge *s, const ge *p) { ge_add(s, p, p); } +// could use ge_add() for this, but this is slightly faster +sv ge_double(ge *s, const ge *p) +{ + fe a, b, c, d, e, f, g, h; + fe_sub(a, p->Y, p->X); fe_sq(a, a); // A = (Y1-X1)^2 + fe_add(b, p->X, p->Y); fe_sq(b, b); // B = (Y1+X1)^2 + fe_sq (c, p->T); fe_mul(c, c, D2); // C = T1^2 * k + fe_sq (d, p->Z); fe_add(d, d, d); // D = Z1^2 * 2 + fe_sub(e, b, a); // E = B - A + fe_sub(f, d, c); // F = D - C + fe_add(g, d, c); // G = D + C + fe_add(h, b, a); // H = B + A + fe_mul(s->X, e, f); // X3 = E * F + fe_mul(s->Y, g, h); // Y3 = G * H + fe_mul(s->Z, f, g); // Z3 = F * G + fe_mul(s->T, e, h); // T3 = E * H +} sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) { diff --git a/src/more_speed.c 
b/src/more_speed.c deleted file mode 100644 index 9afd728..0000000 --- a/src/more_speed.c +++ /dev/null @@ -1,118 +0,0 @@ -// Some low hanging fruits to make Monocypher a bit faster. -// -// It's a nice to have, but it probably won't save you if Monocypher -// is really too slow. If that ever happens, consider switching to 64 -// bits implementations such as Donna, or even direct assembly. -// -// Using Donna's field arithmetic should yield a 5-fold speedup at -// least. Signing can be even faster, but it takes a lot of code. -// -// On my computer, the alternate routines below makes curve25519 about -// 20% faster. Noticable, but not worth the extra hundred lines. - - -// Specialised squaring function, faster than general multiplication. -sv fe_sq(fe h, const fe f) -{ - i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; - i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; - i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; - i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; - i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; - i32 f8_19 = f8*19; i32 f9_38 = f9*38; - - i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 - + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; - i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 - + f4 *(i64)f7_38 + f5_2*(i64)f6_19; - i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 - + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; - i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 - + f5_2*(i64)f8_19 + f6 *(i64)f7_38; - i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 - + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; - i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 - + f6 *(i64)f9_38 + f7_2*(i64)f8_19; - i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 - + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; - i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 - + f3_2*(i64)f4 + f8 *(i64)f9_38; - 
i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 - + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; - i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 - + f3_2*(i64)f6 + f4 *(i64)f5_2; - - i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; - c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; - c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; - c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 << 25; - c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 << 25; - c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 << 26; - c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 << 26; - c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 << 25; - c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 << 25; - c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; - c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 << 26; - c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; - c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; - - h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; - h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; -} - -// slightly faster inversion modulo 2^255 - 19 -sv fe_invert(fe out, const fe z) -{ - fe t0, t1, t2, t3; - fe_sq(t0, z ); - fe_sq(t1, t0); - fe_sq(t1, t1); - fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t2, t0); fe_mul(t1 , t1, t2); - fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); -} - -// slightly faster power to 2^252 - 3, for ed25519 decompression -void fe_pow22523(fe out, const fe z) -{ - fe 
t0, t1, t2; - fe_sq(t0, z); - fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t0, t0); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); -} - -// Slightly faster twisted Edwards point doubling, assuming we use a -// specialised squaring function such as the above. -sv ge_double(ge *s, const ge *p) -{ - fe a, b, c, d, e, f, g, h; - fe_sub(a, p->Y, p->X); fe_sq(a, a); // A = (Y1-X1)^2 - fe_add(b, p->X, p->Y); fe_sq(b, b); // B = (Y1+X1)^2 - fe_sq (c, p->T); fe_mul(c, c, D2); // C = T1^2 * k - fe_sq (d, p->Z); fe_add(d, d, d); // D = Z1^2 * 2 - fe_sub(e, b, a); // E = B - A - fe_sub(f, d, c); // F = D - C - fe_add(g, d, c); // G = D + C - fe_add(h, b, a); // H = B + A - fe_mul(s->X, e, f); // X3 = E * F - fe_mul(s->Y, g, h); // Y3 = G * H - fe_mul(s->Z, f, g); // Z3 = F * G - fe_mul(s->T, e, h); // T3 = E * H -}