From: Loup Vaillant Date: Thu, 6 Jul 2017 19:55:06 +0000 (+0200) Subject: Sacrificed conciseness for performance X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=b7d250469026e357fa2d0c23f9e2982c70658334;p=Monocypher.git Sacrificed conciseness for performance --- diff --git a/src/monocypher.c b/src/monocypher.c index 670b9d4..a4703c1 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -97,17 +97,26 @@ int crypto_zerocmp(const u8 *p, size_t n) sv chacha20_rounds(u32 out[16], const u32 in[16]) { - FOR (i, 0, 16) { out[i] = in[i]; } + // The temporary variables make Chacha20 10% faster. + u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3]; + u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7]; + u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11]; + u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15]; + FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop. - QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0 - QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1 - QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2 - QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3 - QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 0 - QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 1 - QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 2 - QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 3 + QUARTERROUND(t0, t4, t8 , t12); // column 0 + QUARTERROUND(t1, t5, t9 , t13); // column 1 + QUARTERROUND(t2, t6, t10, t14); // column 2 + QUARTERROUND(t3, t7, t11, t15); // column 3 + QUARTERROUND(t0, t5, t10, t15); // diagonal 0 + QUARTERROUND(t1, t6, t11, t12); // diagonal 1 + QUARTERROUND(t2, t7, t8 , t13); // diagonal 2 + QUARTERROUND(t3, t4, t9 , t14); // diagonal 3 + } + out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3; + out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7; + out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11; + 
out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15; } sv chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32]) @@ -764,7 +773,7 @@ void crypto_argon2i(u8 *tag, u32 tag_size, /// Arithmetic modulo 2^255 - 19 /// //////////////////////////////////// // Taken from Supercop's ref10 implementation. -// A bit bigger than TweetNaCl, over 6 times faster. +// A bit bigger than TweetNaCl, about 8 times faster. // field element typedef i32 fe[10]; @@ -883,24 +892,92 @@ sv fe_mul(fe h, const fe f, const fe g) h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; } -sv fe_sq(fe h, const fe f) { fe_mul(h, f, f); } - -// power to a power of 2 minus a small number. -// Timing depends on pow_2 and minus, so they shall not be secret. -sv fe_power(fe out, const fe z, int pow_2, u8 minus) +// we could use fe_mul() for this, but this is significantly faster +sv fe_sq(fe h, const fe f) { - minus--; - fe c; fe_copy(c, z); - for (int i = pow_2 - 2; i >= 0; i--) { - fe_sq(c, c); - if (i >= 8 || !((minus >> i) & 1)) { - fe_mul(c, c, z); - } - } - fe_copy(out, c); + i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; + i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; + i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; + i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; + i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; + i32 f8_19 = f8*19; i32 f9_38 = f9*38; + + i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 + + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; + i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 + + f4 *(i64)f7_38 + f5_2*(i64)f6_19; + i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 + + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; + i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 + + f5_2*(i64)f8_19 + f6 *(i64)f7_38; + i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 + + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; + i64 
h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 + + f6 *(i64)f9_38 + f7_2*(i64)f8_19; + i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 + + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; + i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 + + f3_2*(i64)f4 + f8 *(i64)f9_38; + i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 + + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; + i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + + f3_2*(i64)f6 + f4 *(i64)f5_2; + + i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; + c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 << 25; + c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 << 25; + c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 << 26; + c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 << 26; + c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 << 25; + c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 << 25; + c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; + c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 << 26; + c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; + c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; + + h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; + h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; +} + +// This could be simplified, but it would be slower +sv fe_invert(fe out, const fe z) +{ + fe t0, t1, t2, t3; + fe_sq(t0, z ); + fe_sq(t1, t0); + fe_sq(t1, t1); + fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t2, t0); fe_mul(t1 , t1, t2); + fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); + fe_sq(t3, t2); FOR (i, 1, 100) 
fe_sq(t3, t3); fe_mul(t2 , t3, t2); + fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); + fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); +} + +// This could be simplified, but it would be slower +void fe_pow22523(fe out, const fe z) +{ + fe t0, t1, t2; + fe_sq(t0, z); + fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); + fe_mul(t0, t0, t1); + fe_sq(t0, t0); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); + fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); + fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); + fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); } -sv fe_invert (fe out, const fe z) { fe_power(out, z, 255, 21); } -sv fe_pow22523(fe out, const fe z) { fe_power(out, z, 252, 3); } sv fe_tobytes(u8 s[32], const fe h) { @@ -1125,7 +1202,23 @@ sv ge_add(ge *s, const ge *p, const ge *q) fe_mul(s->T, e, h); // T3 = E * H } -sv ge_double(ge *s, const ge *p) { ge_add(s, p, p); } +// could use ge_add() for this, but this is slightly faster +sv ge_double(ge *s, const ge *p) +{ + fe a, b, c, d, e, f, g, h; + fe_sub(a, p->Y, p->X); fe_sq(a, a); // A = (Y1-X1)^2 + fe_add(b, p->X, p->Y); fe_sq(b, b); // B = (Y1+X1)^2 + fe_sq (c, p->T); fe_mul(c, c, D2); // C = T1^2 * k + fe_sq (d, p->Z); fe_add(d, d, d); // D = Z1^2 * 2 + fe_sub(e, b, a); // E = B - A + fe_sub(f, d, c); // F = D - C + fe_add(g, d, c); // G = D + C + fe_add(h, b, a); // H = B + A + fe_mul(s->X, e, f); // X3 = E * F + fe_mul(s->Y, g, h); // Y3 = G * H + fe_mul(s->Z, f, g); // Z3 = F * G + fe_mul(s->T, e, h); // T3 = E * H +} sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32]) { diff --git a/src/more_speed.c 
b/src/more_speed.c deleted file mode 100644 index 9afd728..0000000 --- a/src/more_speed.c +++ /dev/null @@ -1,118 +0,0 @@ -// Some low hanging fruits to make Monocypher a bit faster. -// -// It's a nice to have, but it probably won't save you if Monocypher -// is really too slow. If that ever happens, consider switching to 64 -// bits implementations such as Donna, or even direct assembly. -// -// Using Donna's field arithmetic should yield a 5-fold speedup at -// least. Signing can be even faster, but it takes a lot of code. -// -// On my computer, the alternate routines below makes curve25519 about -// 20% faster. Noticable, but not worth the extra hundred lines. - - -// Specialised squaring function, faster than general multiplication. -sv fe_sq(fe h, const fe f) -{ - i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4]; - i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9]; - i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2; - i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2; - i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38; - i32 f8_19 = f8*19; i32 f9_38 = f9*38; - - i64 h0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19 - + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38; - i64 h1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19 - + f4 *(i64)f7_38 + f5_2*(i64)f6_19; - i64 h2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38 - + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19; - i64 h3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38 - + f5_2*(i64)f8_19 + f6 *(i64)f7_38; - i64 h4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2 - + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38; - i64 h5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3 - + f6 *(i64)f9_38 + f7_2*(i64)f8_19; - i64 h6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4 - + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19; - i64 h7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5 - + f3_2*(i64)f4 + f8 *(i64)f9_38; - 
i64 h8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6 - + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38; - i64 h9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 - + f3_2*(i64)f6 + f4 *(i64)f5_2; - - i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; - c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; - c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; - c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1; h1 -= c1 << 25; - c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5; h5 -= c5 << 25; - c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2; h2 -= c2 << 26; - c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6; h6 -= c6 << 26; - c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3; h3 -= c3 << 25; - c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7; h7 -= c7 << 25; - c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4; h4 -= c4 << 26; - c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8; h8 -= c8 << 26; - c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; - c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0; h0 -= c0 << 26; - - h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4; - h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9; -} - -// slightly faster inversion modulo 2^255 - 19 -sv fe_invert(fe out, const fe z) -{ - fe t0, t1, t2, t3; - fe_sq(t0, z ); - fe_sq(t1, t0); - fe_sq(t1, t1); - fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t2, t0); fe_mul(t1 , t1, t2); - fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); -} - -// slightly faster power to 2^252 - 3, for ed25519 decompression -void fe_pow22523(fe out, const fe z) -{ - fe 
t0, t1, t2; - fe_sq(t0, z); - fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t0, t0); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0); - fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1); - fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0); - fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z); -} - -// Slightly faster twisted Edwards point doubling, assuming we use a -// specialised squaring function such as the above. -sv ge_double(ge *s, const ge *p) -{ - fe a, b, c, d, e, f, g, h; - fe_sub(a, p->Y, p->X); fe_sq(a, a); // A = (Y1-X1)^2 - fe_add(b, p->X, p->Y); fe_sq(b, b); // B = (Y1+X1)^2 - fe_sq (c, p->T); fe_mul(c, c, D2); // C = T1^2 * k - fe_sq (d, p->Z); fe_add(d, d, d); // D = Z1^2 * 2 - fe_sub(e, b, a); // E = B - A - fe_sub(f, d, c); // F = D - C - fe_add(g, d, c); // G = D + C - fe_add(h, b, a); // H = B + A - fe_mul(s->X, e, f); // X3 = E * F - fe_mul(s->Y, g, h); // Y3 = G * H - fe_mul(s->Z, f, g); // Z3 = F * G - fe_mul(s->T, e, h); // T3 = E * H -}