From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Mon, 8 May 2017 15:32:39 +0000 (+0200)
Subject: reimplemented blake2b from spec
X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=177cb183f210cc78c5865226814d57d97cf4b1b0;p=Monocypher.git

reimplemented blake2b from spec
---

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..837e201
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,37 @@
+Designers
+---------
+
+- **Chacha20:** Daniel J. Bernstein.
+- **Poly1305:** Daniel J. Bernstein.
+- **Blake2:**   Jean-Philippe Aumasson, Christian Winnerlein, Samuel Neves,
+                and Zooko Wilcox-O'Hearn
+- **Argon2:**   Alex Biryukov, Daniel Dinu, and Dmitry Khovratovich
+- **X25519:**   Daniel J. Bernstein
+- **edDSA:**    Daniel J. Bernstein, Bo-Yin Yang, Niels Duif, Peter
+                Schwabe, and Tanja Lange
+
+Implementors
+------------
+
+- **Chacha20:** Loup Vaillant, implemented from spec.
+- **Poly1305:** Loup Vaillant, implemented from spec.
+- **Blake2b:**  Loup Vaillant, implemented from spec.
+- **Argon2i:**  Loup Vaillant, implemented from spec.
+- **X25519:**   Daniel J. Bernstein, taken and packaged from SUPERCOP
+                ref10.
+- **edDSA:**    Daniel J. Bernstein, taken and adapted from SUPERCOP
+                ref10 and TweetNaCl.
+
+Test suite
+----------
+
+Designed and implemented by Loup Vaillant, using _libsodium_ (by many
+authors), and _ed25519-donna_ (by Andrew Moon âfloodyberry).
+
+Thanks
+------
+
+Mike Pechkin and AndrÃ© Maroneze for finding bugs in earlier versions,
+and Andrew Moon for clarifying carry propagation in modular
+arithmetic.
+
diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..cdc7e55
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,5 @@
+Copying and distribution of the code, with or without modification,
+are permitted in any medium without royalty.  This code is offered
+as-is, without any warranty.
+
+(In other words, do whatever you want with the code.)
diff --git a/README b/README
index 01a3e4f..8908dac 100644
--- a/README
+++ b/README
@@ -1,65 +1,3 @@
-Authors
--------
-
-Packaged by Loup Vaillant.
-
-- Chacha20: Loup Vaillant, implemented from spec.
-- Poly1305: Loup Vaillant, implemented from spec.
-- Blake2b:  derived from https://tools.ietf.org/html/rfc7693
-- Argon2i:  Loup Vaillant, implemented from spec.
-- X25519:   taken from SUPERCOP ref10.
-- ed25519:  adapted from SUPERCOP ref10 and http://tweetnacl.cr.yp.to
-- High-level constructions: Loup Vaillant, implemented from specs and
-  first principles
-
-Licence
--------
-
-For everything *but* Blake2b:
-
-  Copying and distribution of the code, with or without modification,
-  are permitted in any medium without royalty.  This code is offered
-  as-is, without any warranty.
-
----
-
-For the Blake2b code:
-
-  Copyright (c) 2015 IETF Trust and the persons identified as authors
-  of the code. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-  - Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  - Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in
-    the documentation and/or other materials provided with the
-    distribution.
-
-  - Neither the name of Internet Society, IETF or IETF Trust, nor the
-    names of specific contributors, may be used to endorse or promote
-    products derived from this software without specific prior written
-    permission.
-
-  - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-    CONTRIBUTORS âAS ISâ AND ANY EXPRESS OR IMPLIED WARRANTIES,
-    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-    BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-    ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-    TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
-    THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-    SUCH DAMAGE.
-
-
 Current status
 --------------
 
@@ -87,7 +25,7 @@ least once.
 Integration to your project
 ---------------------------
 
-Just copy monocypher.c and monocypher.h.
+Just include src/monocypher.c and src/monocypher.h in your project.
 
 They compile as C99, C11, C++98, C++11, C++14, and C++17. (Tested with
 gcc 5.4.0 and clang 2.8.0 on GNU/Linux.)
@@ -105,7 +43,7 @@ the default Blake2b, do as the test suite does:
 - Link the final program with a suitable SHA-512 implementation.  You
   can use the sha512.c and sha512.h files provided here.
 
-Note that even though the default hash (Blake2b) is not widely used,
-it doesn't prevent you from upgrading to faster implementations if you
-need to.  The Donna implementations of ed25519 for instance can use a
-custom hash âone of the fuzz tests does just that.
+Note that even though the default hash (Blake2b) is not "standard",
+you can still upgrade to faster implementations if you really need to.
+The Donna implementations of ed25519 for instance can use a custom
+hash âone of the tests does just that.
diff --git a/src/monocypher.c b/src/monocypher.c
index 04f44df..f8b7511 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -324,93 +324,102 @@ void crypto_poly1305_auth(u8     mac[16],  const u8 *msg,
 }
 
 ////////////////
-/// Blake2 b /// (taken from the reference
-////////////////  implentation in RFC 7693)
+/// Blake2 b ///
+////////////////
 
-// Initialization Vector.
-static const u64 blake2b_iv[8] = {
+static const u64 iv[8] = {
     0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
     0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
     0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 };
 
-// increment a 128-bit "word".
-sv incr(u64 x[2], u64 y)
+// increment the input offset
+sv incr(crypto_blake2b_ctx *ctx)
+{
+    u64 *x = ctx->input_offset;
+    u8   y = ctx->buffer_idx;
+    x[0] += y;                 // increment low word
+    if (x[0] < y) { x[1]++; }  // carry overflow to high word
+}
+
+// pad the buffer with zeroes
+sv pad(crypto_blake2b_ctx *ctx)
 {
-    x[0] += y;                 // increment the low word
-    if (x[0] < y) { x[1]++; }  // handle overflow
+    FOR (i, ctx->buffer_idx, 128) { ctx->buffer[i] = 0; }
+    ctx->buffer_idx = 128; // mark the buffer as filled
 }
 
-sv blake2b_compress(crypto_blake2b_ctx *ctx, int last_block)
+sv compress(crypto_blake2b_ctx *ctx, int is_last_block)
 {
     static const u8 sigma[12][16] = {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
     };
 
-    // init work variables (before shuffling them)
+    // load input buffer
+    u64 input[16];
+    FOR(i, 0, 16) { input[i] = load64_le(ctx->buffer + i*8); }
+
+    // init work vector
     u64 v[16];
     FOR (i, 0, 8) {
-        v[i    ] = ctx->hash[i];
-        v[i + 8] = blake2b_iv[i];
+        v[i  ] = ctx->hash[i];
+        v[i+8] = iv[i];
     }
-    v[12] ^= ctx->input_size[0]; // low 64 bits of offset
-    v[13] ^= ctx->input_size[1]; // high 64 bits
-    if (last_block) { v[14] = ~v[14]; }
-
-    // load the input buffer
-    u64 m[16];
-    FOR (i ,0, 16) { m[i] = load64_le(&ctx->buf[i * 8]); }
+    v[12] ^= ctx->input_offset[0];
+    v[13] ^= ctx->input_offset[1];
+    if (is_last_block) { v[14] = ~v[14]; }
 
-    // shuffle the work variables with the 12 rounds
+    // mangle work vector
     FOR (i, 0, 12) {
-#define B2B_G(a, b, c, d, x, y)                                    \
-        v[a] += v[b] + x;  v[d] ^= v[a];  v[d] = rotr64(v[d], 32); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 24); \
-        v[a] += v[b] + y;  v[d] ^= v[a];  v[d] = rotr64(v[d], 16); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 63)
-
-        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+#define BLAKE2_G(v, a, b, c, d, x, y)                       \
+        v[a] += v[b] + x;  v[d] = rotr64(v[d] ^ v[a], 32);  \
+        v[c] += v[d];      v[b] = rotr64(v[b] ^ v[c], 24);  \
+        v[a] += v[b] + y;  v[d] = rotr64(v[d] ^ v[a], 16);  \
+        v[c] += v[d];      v[b] = rotr64(v[b] ^ v[c], 63);  \
+
+        BLAKE2_G(v, 0, 4,  8, 12, input[sigma[i][ 0]], input[sigma[i][ 1]]);
+        BLAKE2_G(v, 1, 5,  9, 13, input[sigma[i][ 2]], input[sigma[i][ 3]]);
+        BLAKE2_G(v, 2, 6, 10, 14, input[sigma[i][ 4]], input[sigma[i][ 5]]);
+        BLAKE2_G(v, 3, 7, 11, 15, input[sigma[i][ 6]], input[sigma[i][ 7]]);
+        BLAKE2_G(v, 0, 5, 10, 15, input[sigma[i][ 8]], input[sigma[i][ 9]]);
+        BLAKE2_G(v, 1, 6, 11, 12, input[sigma[i][10]], input[sigma[i][11]]);
+        BLAKE2_G(v, 2, 7,  8, 13, input[sigma[i][12]], input[sigma[i][13]]);
+        BLAKE2_G(v, 3, 4,  9, 14, input[sigma[i][14]], input[sigma[i][15]]);
     }
-    // accumulate the work variables into the hash
+    // update hash
     FOR (i, 0, 8) { ctx->hash[i] ^= v[i] ^ v[i+8]; }
+    // mark buffer as empty
+    ctx->buffer_idx = 0;
 }
 
 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,
                                  const u8           *key, size_t key_size)
 {
-    // Initial hash == initialization vector...
-    FOR (i, 0, 8) { ctx->hash[i] = blake2b_iv[i]; }
-    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size;  // ...mostly
+    // initial hash
+    FOR (i, 0, 8) { ctx->hash[i] = iv[i]; }
+    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ out_size;
 
-    ctx->input_size[0] = 0;       // input count low word
-    ctx->input_size[1] = 0;       // input count high word
-    ctx->c             = 0;       // pointer within buffer
-    ctx->output_size   = out_size;  // size of the final hash
+    ctx->input_offset[0] = 0;         // begining of the input, no offset
+    ctx->input_offset[1] = 0;         // begining of the input, no offset
+    ctx->buffer_idx      = 0;         // buffer is empty
+    ctx->hash_size       = out_size;  // remember the hash size we want
 
-    // If there's a key, put it in the first block, then pad with zeroes
+    // if there is a key, the first block is that key
     if (key_size > 0) {
-        FOR (i, 0     , key_size) { ctx->buf[i] = key[i]; }
-        FOR (i, key_size, 128   ) { ctx->buf[i] = 0;      }
-        ctx->c = 128; // mark the block as used
+        crypto_blake2b_update(ctx, key, key_size);
+        pad(ctx);
     }
 }
 
@@ -422,28 +431,21 @@ void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
 void crypto_blake2b_update(crypto_blake2b_ctx *ctx, const u8 *in, size_t in_size)
 {
     FOR (i, 0, in_size) {
-        // If the buffer is full, increment the counters and
-        // add (compress) the current buffer to the hash
-        if (ctx->c == 128) {
-            ctx->c = 0;
-            incr(ctx->input_size, 128);
-            blake2b_compress(ctx, 0); // not last time -> 0
+        if (ctx->buffer_idx == 128) { // If buffer is full,
+            incr(ctx);                // update the input offset
+            compress(ctx, 0);         // compress the (not last) block
         }
-        // By now the buffer is not full.  We add one input byte.
-        ctx->buf[ctx->c] = in[i];
-        ctx->c++;
+        ctx->buffer[ctx->buffer_idx] = in[i];
+        ctx->buffer_idx++;
     }
 }
 
 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *out)
 {
-    // update input size, pad then compress the buffer
-    incr(ctx->input_size, ctx->c);
-    FOR (i, ctx->c, 128) { ctx->buf[i] = 0; }
-    blake2b_compress(ctx, 1); // last time -> 1
-
-    // copy the hash in the output (little endian of course)
-    FOR (i, 0, ctx->output_size) {
+    incr    (ctx);    // update the input offset (the last block may not be full)
+    pad     (ctx);    // pad the last block with zeroes
+    compress(ctx, 1); // compress the last block
+    FOR (i, 0, ctx->hash_size) {
         out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xff;
     }
 }
diff --git a/src/monocypher.h b/src/monocypher.h
index cd17d1b..3aeb40e 100644
--- a/src/monocypher.h
+++ b/src/monocypher.h
@@ -67,11 +67,11 @@ void crypto_poly1305_auth(uint8_t        mac[16],
 /// Blake2 b ///
 ////////////////
 typedef struct {
-    uint8_t  buf[128];      // input buffer
-    uint64_t hash[8];       // chained state
-    uint64_t input_size[2]; // total number of bytes
-    uint8_t  c;             // pointer for buf[]
-    uint8_t  output_size;   // digest size
+    uint64_t hash[8];
+    uint64_t input_offset[2];
+    uint8_t  buffer[128];
+    size_t   buffer_idx;
+    size_t   hash_size;
 } crypto_blake2b_ctx;
 
 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t out_size,