From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Thu, 8 Sep 2016 20:29:51 +0000 (+0200)
Subject: confirmed that local vars are faster
X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=bee9383d82d22fad05862177630a5fb39edc09e8;p=Monocypher.git

confirmed that local vars are faster

It's a bit of a surprise, but on my machine, local variables turn
out to be almost 10% faster than a local array, possibly thanks
to compiler optimizations. (intel core i5 skylake, gcc 5.4, -O2)
---

diff --git a/chacha20.c b/chacha20.c
index cd488c4..f5cca75 100644
--- a/chacha20.c
+++ b/chacha20.c
@@ -51,10 +51,9 @@ increment_counter(crypto_chacha_ctx *ctx)
         ctx->input[13]++;
 }
 
-///////////////////////
-/// Round functions ///
-///////////////////////
-
+////////////
+/// Core ///
+////////////
 
 // Mangles the chacha context into a random-looking output.
 //
@@ -64,24 +63,17 @@ increment_counter(crypto_chacha_ctx *ctx)
 static void
 chacha20_rounds(uint32_t output[16], const crypto_chacha_ctx *ctx)
 {
-    // Local variables instead of indices, to facilitate optimisations
-    // TODO: test this shit.  The speed increase might be small.
-    uint32_t x0  = ctx->input[ 0];
-    uint32_t x1  = ctx->input[ 1];
-    uint32_t x2  = ctx->input[ 2];
-    uint32_t x3  = ctx->input[ 3];
-    uint32_t x4  = ctx->input[ 4];
-    uint32_t x5  = ctx->input[ 5];
-    uint32_t x6  = ctx->input[ 6];
-    uint32_t x7  = ctx->input[ 7];
-    uint32_t x8  = ctx->input[ 8];
-    uint32_t x9  = ctx->input[ 9];
-    uint32_t x10 = ctx->input[10];
-    uint32_t x11 = ctx->input[11];
-    uint32_t x12 = ctx->input[12];
-    uint32_t x13 = ctx->input[13];
-    uint32_t x14 = ctx->input[14];
-    uint32_t x15 = ctx->input[15];
+    // Local variables instead of indices, to facilitate optimisations.
+    // Compared to a local array, this is almost 10% faster on my machine
+    // (2016 laptop, Intel i5 Skylake, 64-bit Linux, GCC version 5.4, -O2)
+    uint32_t x0  = ctx->input[ 0];    uint32_t x1  = ctx->input[ 1];
+    uint32_t x2  = ctx->input[ 2];    uint32_t x3  = ctx->input[ 3];
+    uint32_t x4  = ctx->input[ 4];    uint32_t x5  = ctx->input[ 5];
+    uint32_t x6  = ctx->input[ 6];    uint32_t x7  = ctx->input[ 7];
+    uint32_t x8  = ctx->input[ 8];    uint32_t x9  = ctx->input[ 9];
+    uint32_t x10 = ctx->input[10];    uint32_t x11 = ctx->input[11];
+    uint32_t x12 = ctx->input[12];    uint32_t x13 = ctx->input[13];
+    uint32_t x14 = ctx->input[14];    uint32_t x15 = ctx->input[15];
 
     for (int i = 20; i > 0; i -= 2) { // 20 rounds, 2 rounds per loop.
         // The ctx is viewed as a matrix, whose indices are like
@@ -119,22 +111,14 @@ chacha20_rounds(uint32_t output[16], const crypto_chacha_ctx *ctx)
         QUARTERROUND(x3, x4,  x9, x14); // diagonal 4
     }
 
-    output[ 0] = x0;
-    output[ 1] = x1;
-    output[ 2] = x2;
-    output[ 3] = x3;
-    output[ 4] = x4;
-    output[ 5] = x5;
-    output[ 6] = x6;
-    output[ 7] = x7;
-    output[ 8] = x8;
-    output[ 9] = x9;
-    output[10] = x10;
-    output[11] = x11;
-    output[12] = x12;
-    output[13] = x13;
-    output[14] = x14;
-    output[15] = x15;
+    output[ 0] = x0;    output[ 1] = x1;
+    output[ 2] = x2;    output[ 3] = x3;
+    output[ 4] = x4;    output[ 5] = x5;
+    output[ 6] = x6;    output[ 7] = x7;
+    output[ 8] = x8;    output[ 9] = x9;
+    output[10] = x10;   output[11] = x11;
+    output[12] = x12;   output[13] = x13;
+    output[14] = x14;   output[15] = x15;
 }
 
 void
@@ -145,7 +129,7 @@ crypto_block_chacha20(uint8_t output[64], crypto_chacha_ctx *ctx)
 
     // Now our buffer is seriously garbled.  However, it is still easy
     // to deduce the initial context from it: just invert the quarter
-    // rounds and apply that in reverse order.
+    // rounds (it's a school exercise, really).
     //
     // So we perform a final operation: we add the initial context to
     // the buffer.  Half of this initial context is made up of the key,
@@ -154,20 +138,19 @@ crypto_block_chacha20(uint8_t output[64], crypto_chacha_ctx *ctx)
     //
     // This effectively removes half the intel the attacker needs to
     // reverse the computation, forcing him to try all 2^256 possibilities.
-    // Well that's the idea anyway.  the security of  this trick is not
+    // Well that's the idea anyway.  The security of this trick is not
     // proven, and with few enough rounds, there are clever schemes that
     // don't try the whole key space.
     //
-    // As of 2016, "low enough" means 6 or 7 rounds.  We use 20.
+    // As of 2016, "few enough" means 6 or 7 rounds.  We use 20.
     // This should be enough to prevent anyone from breaking them all
-    // in the forseeable future.
+    // in the foreseeable future (say the experts).
     //
     // Note that in principle, we don't have to add the constant, nonce,
-    // and counter: that part could be reversed by the attacker anyway
-    // so we'd be just as secure if we didn't.  However that only improves
-    // the performance of naive implementations such as this one.  With
-    // SIMD, it's faster to just add the lot, so that's what the standard
-    // does.
+    // and counter: that part could be reversed by the attacker anyway.
+    // However, that would only improve the performance of naive
+    // implementations such as this one.  With SIMD instructions, it's
+    // faster to just add the lot, so that's what the standard does.
     for (unsigned i = 0; i < 16; i++) {
         uint32_t sum = buffer[i] + ctx->input[i];
         store32_le(output + i*4, sum);