From cb5d6013310abf64bce908779ebd6d92fcb84136 Mon Sep 17 00:00:00 2001
From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Sun, 4 Sep 2016 01:48:25 +0200
Subject: [PATCH] Added chacha20 related algorithms

---
 .gitignore |   3 +
 chacha20.c | 465 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 chacha20.h |  92 +++++++++++
 3 files changed, 560 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 chacha20.c
 create mode 100644 chacha20.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..62fb2d7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# backup files
+*~
+*.o
\ No newline at end of file
diff --git a/chacha20.c b/chacha20.c
new file mode 100644
index 0000000..41b14f2
--- /dev/null
+++ b/chacha20.c
@@ -0,0 +1,465 @@
+#include "chacha20.h"
+#include <string.h>
+
+/////////////////
+/// Utilities ///
+/////////////////
+
+// undefined if n >= 32 or n == 0
+// Should compile into a single rot32 instruction
+// if available on the processsor.
+static uint32_t
+rotl32 (uint32_t x, uint32_t n)
+{
+    return (x << n) | (x >> (32 - n));
+}
+
+static uint32_t
+load32_le(const uint8_t s[4])
+{
+    // Portable, slow way.
+    // Only affects initialisation, though.
+    return s[0]
+        | (s[1] <<  8)
+        | (s[2] << 16)
+        | (s[3] << 24);
+}
+
+static void
+store32_le(uint8_t output[4], uint32_t input)
+{
+    // Portable, slow way.
+    output[0] =  input        & 0xff;
+    output[1] = (input >>  8) & 0xff;
+    output[2] = (input >> 16) & 0xff;
+    output[3] = (input >> 24) & 0xff;
+}
+
+// The counter is basically unlimited.  Still, if you ever allow
+// it to wrap around itself (by encrypting  more than 2^70 bytes
+// per nonce), you'll expose the XOR of the begining and the end
+// of your plaintext, and lose confidentiality in the process.
+// Oops.
+static void
+increment_counter(crypto_chacha_ctx *ctx)
+{
+    ctx->input[12]++;
+    // That conditional ultimately only depends on the length of the input,
+    // which is not secret.  Heck, the counter itself is not secret.
+    // No timing attack there, we're fine.
+    if (!ctx->input[12])
+        ctx->input[13]++;
+}
+
+///////////////////////
+/// Round functions ///
+///////////////////////
+
+// A quarter round is meant to mangle a fourth of a chacha context.
+// (a line, a column, or any subset you can think of).  Enough of
+// these rounds, carefully chosen, will garble the input beyond
+// recognition.
+//
+// WARNING: THIS OPERATION IS REVERSIBLE.
+//
+// If you build a crypto primitive on top of it without additional
+// precautions, any rookie cryptanalist can break it.
+#define QUARTERROUND(a, b, c, d)       \
+    a = a + b;  d = rotl32(d ^ a, 16); \
+    c = c + d;  b = rotl32(b ^ c, 12); \
+    a = a + b;  d = rotl32(d ^ a,  8); \
+    c = c + d;  b = rotl32(b ^ c,  7)
+
+// Mangles the chacha context into a pseudorandom output
+// That is, if you don't know the key stored in the context,
+// you can't deduce squat about the output.  This is true
+// even if you know the output of many other contexes, even
+// if they differ from this one by only one bit.
+static void
+chacha20_rounds(uint8_t output[64], const crypto_chacha_ctx *ctx)
+{
+    // Local variables instead of indices, to facilitate optimisations
+    // TODO: test this shit.  The speed increase might be small.
+    uint32_t x0  = ctx->input[ 0];
+    uint32_t x1  = ctx->input[ 1];
+    uint32_t x2  = ctx->input[ 2];
+    uint32_t x3  = ctx->input[ 3];
+    uint32_t x4  = ctx->input[ 4];
+    uint32_t x5  = ctx->input[ 5];
+    uint32_t x6  = ctx->input[ 6];
+    uint32_t x7  = ctx->input[ 7];
+    uint32_t x8  = ctx->input[ 8];
+    uint32_t x9  = ctx->input[ 9];
+    uint32_t x10 = ctx->input[10];
+    uint32_t x11 = ctx->input[11];
+    uint32_t x12 = ctx->input[12];
+    uint32_t x13 = ctx->input[13];
+    uint32_t x14 = ctx->input[14];
+    uint32_t x15 = ctx->input[15];
+
+    for (int i = 20; i > 0; i -= 2) { // 20 rounds, 2 rounds per loop.
+        // The ctx is viewed as a matrix, whose indices are like
+        //  0  1  2  3
+        //  4  5  6  7
+        //  8  9 10 11
+        // 12 13 14 15
+
+        // Column round.  Very SIMD friendly, if you want high performance.
+        QUARTERROUND(x0, x4,  x8, x12); // column 0
+        QUARTERROUND(x1, x5,  x9, x13); // column 1
+        QUARTERROUND(x2, x6, x10, x14); // column 2
+        QUARTERROUND(x3, x7, x11, x15); // column 3
+
+        // South East diagonal round.
+        // With a bit of permutation, this is also very SIMD friendly.
+        QUARTERROUND(x0, x5, x10, x15); // diagonal 1
+        QUARTERROUND(x1, x6, x11, x12); // diagonal 2
+        QUARTERROUND(x2, x7,  x8, x13); // diagonal 3
+        QUARTERROUND(x3, x4,  x9, x14); // diagonal 4
+    }
+
+    // Now our buffer is seriously garbled.  However, it is still easy
+    // to deduce the initial context from it: just invert the quarter
+    // rounds and apply that in reverse order.
+    //
+    // So we perform a final operation: we add the initial context to
+    // the buffer.  Half of this initial context is made up of the key,
+    // unknown to the attacker (the constant, nonce and counter are
+    // known however).
+    //
+    // This effectively removes half the intel the attacker needs to
+    // reverse the computation, forcing him to try all 2^256 possibilities.
+    // Well that's the idea anyway.  the security of  this trick is not
+    // proven, and with few enough rounds, there are clever schemes that
+    // don't try the whole key space.
+    //
+    // As of 2016, "low enough" means 6 or 7 rounds.  We use 20.
+    // This shoul be enough to prevent anyone from breaking them all
+    // in the forseeable future.
+    //
+    // Note that in principle, we don't have to add the constants nonce
+    // and counter: that part could be reversed by the attacker anyway
+    // so we'd be just as secure if we didn't.  However that only improves
+    // the performance of naive implementations such as this one.  With
+    // SIMD, it's faster to just add the lot, so that's what the standard
+    // does.
+    x0  += ctx->input[ 0];
+    x1  += ctx->input[ 1];
+    x2  += ctx->input[ 2];
+    x3  += ctx->input[ 3];
+    x4  += ctx->input[ 4];
+    x5  += ctx->input[ 5];
+    x6  += ctx->input[ 6];
+    x7  += ctx->input[ 7];
+    x8  += ctx->input[ 8];
+    x9  += ctx->input[ 9];
+    x10 += ctx->input[10];
+    x11 += ctx->input[11];
+    x12 += ctx->input[12];
+    x13 += ctx->input[13];
+    x14 += ctx->input[14];
+    x15 += ctx->input[15];
+
+    // finally, we can output our buffer
+    store32_le(output +  0, x0 );
+    store32_le(output +  4, x1 );
+    store32_le(output +  8, x2 );
+    store32_le(output + 12, x3 );
+    store32_le(output + 16, x4 );
+    store32_le(output + 20, x5 );
+    store32_le(output + 24, x6 );
+    store32_le(output + 28, x7 );
+    store32_le(output + 32, x8 );
+    store32_le(output + 36, x9 );
+    store32_le(output + 40, x10);
+    store32_le(output + 44, x11);
+    store32_le(output + 48, x12);
+    store32_le(output + 52, x13);
+    store32_le(output + 56, x14);
+    store32_le(output + 60, x15);
+}
+
+// This one is the same as chacha20_rounds, only it gives you only
+// half the output (256 bytes). It's basically the same as HSalsa20,
+// except build on ChaCha.  It is provably as secure as ChaCha20
+static void
+half_chacha20_rounds(uint32_t output[8], const crypto_chacha_ctx *ctx)
+{
+    // Copy pasta rom chacha20_rounds
+    uint32_t x0  = ctx->input[ 0];
+    uint32_t x1  = ctx->input[ 1];
+    uint32_t x2  = ctx->input[ 2];
+    uint32_t x3  = ctx->input[ 3];
+    uint32_t x4  = ctx->input[ 4];
+    uint32_t x5  = ctx->input[ 5];
+    uint32_t x6  = ctx->input[ 6];
+    uint32_t x7  = ctx->input[ 7];
+    uint32_t x8  = ctx->input[ 8];
+    uint32_t x9  = ctx->input[ 9];
+    uint32_t x10 = ctx->input[10];
+    uint32_t x11 = ctx->input[11];
+    uint32_t x12 = ctx->input[12];
+    uint32_t x13 = ctx->input[13];
+    uint32_t x14 = ctx->input[14];
+    uint32_t x15 = ctx->input[15];
+
+    // Copy pasta rom chacha20_rounds
+    for (int i = 20; i > 0; i -= 2) {
+        QUARTERROUND(x0, x4,  x8, x12); // column 0
+        QUARTERROUND(x1, x5,  x9, x13); // column 1
+        QUARTERROUND(x2, x6, x10, x14); // column 2
+        QUARTERROUND(x3, x7, x11, x15); // column 3
+        QUARTERROUND(x0, x5, x10, x15); // diagonal 1
+        QUARTERROUND(x1, x6, x11, x12); // diagonal 2
+        QUARTERROUND(x2, x7,  x8, x13); // diagonal 3
+        QUARTERROUND(x3, x4,  x9, x14); // diagonal 4
+    }
+
+    // Okay, remember about needing that addition?  Well, we only
+    // Disclose half of the output, and that ensures the attacker
+    // is missing 256 bits of information just like in regular chacha20.
+    //
+    // However, to be able to *prove* this is just as secure as chacha20,
+    // we will copy the numbers we know the attacker could have deduced
+    // anyway.  That is:
+    // - words 0, 1, 2, 3 (chacha constant),
+    // - words 12 and 13  (counter),
+    // - words 14 and 15  (nonce)
+    //
+    // This lets us avoid a couple additional loads and additions,
+    // for even moar speed.
+    output[0] = ctx->input[ 0]; // don't add the constant
+    output[1] = ctx->input[ 1]; // don't add the constant
+    output[2] = ctx->input[ 2]; // don't add the constant
+    output[3] = ctx->input[ 3]; // don't add the constant
+    output[4] = ctx->input[12]; // don't add the counter
+    output[5] = ctx->input[13]; // don't add the counter
+    output[6] = ctx->input[14]; // don't add the nonce
+    output[7] = ctx->input[15]; // don't add the nonce
+}
+
+//////////////////////////////
+/// Context initialization ///
+//////////////////////////////
+static void
+init_constant(crypto_chacha_ctx *ctx)
+{
+    // This constant looks like wasted space, that could be used
+    // for a larger key, nounce, or counter.  But while its exact
+    // value hardly matters, some properties it has do.
+    //
+    // Among other things, this constant prevents the existence of
+    // the all zero context, which would map to an all zero output;
+    // and is "assymetric" enough to guarantee good mangling.
+    //
+    // Also, the very exstence of a constant reduces reduces the amount
+    // of the context that's under the control of the attacker (a fourth
+    // instead of a whole half).
+    //
+    // Simply put: keep the constant, it's safer that way.
+    ctx->input[0]  = load32_le((uint8_t*)"expa");
+    ctx->input[1]  = load32_le((uint8_t*)"nd 3");
+    ctx->input[2]  = load32_le((uint8_t*)"2-by");
+    ctx->input[3]  = load32_le((uint8_t*)"te k");
+}
+
+static void
+init_key(crypto_chacha_ctx *ctx, const uint8_t key[32])
+{
+    ctx->input[4]  = load32_le(key +  0);
+    ctx->input[5]  = load32_le(key +  4);
+    ctx->input[6]  = load32_le(key +  8);
+    ctx->input[7]  = load32_le(key + 12);
+    ctx->input[8]  = load32_le(key + 16);
+    ctx->input[9]  = load32_le(key + 20);
+    ctx->input[10] = load32_le(key + 24);
+    ctx->input[11] = load32_le(key + 28);
+}
+
+static void
+init_ctr(crypto_chacha_ctx *ctx, uint64_t ctr)
+{
+    ctx->input[12] = (uint32_t) ctr;                   // LSB
+    ctx->input[13] = (uint32_t) (ctr >> (uint64_t)32); // MSB
+}
+
+static void
+init_nonce(crypto_chacha_ctx *ctx, const uint8_t nonce[8])
+{
+    ctx->input[14] = load32_le(nonce + 0);
+    ctx->input[15] = load32_le(nonce + 4);
+}
+
+static void
+init_big_nonce(crypto_chacha_ctx *ctx, const uint8_t nonce[16])
+{
+    ctx->input[12] = load32_le(nonce +  0);
+    ctx->input[13] = load32_le(nonce +  4);
+    ctx->input[14] = load32_le(nonce +  8);
+    ctx->input[15] = load32_le(nonce + 12);
+}
+
+static void
+init_chacha20(crypto_chacha_ctx *ctx,
+              const uint8_t      key[32],
+              const uint8_t      nonce[8],
+              uint64_t           ctr)
+{
+    init_constant(ctx       );
+    init_key     (ctx, key  );
+    init_ctr     (ctx, ctr  );
+    init_nonce   (ctx, nonce);
+}
+
+// Initializes a chacha context, with a bigger nonce.
+//
+// It uses a cascade scheme where a first block is initialised with
+// the first 128 bits of the nounce (and no counter), and a second block
+// is initialised with a derived key from the first block, and the
+// last 64 bits of the nonce.
+//
+// It's slower than regular initialization, but that big nonce can now
+// be selected at random without fear of collision.
+static void
+init_Xchacha20(crypto_chacha_ctx *ctx,
+               const uint8_t      key[32],
+               const uint8_t      nonce[24],
+               uint64_t           ctr)
+{
+    // initialise a first block
+    crypto_chacha_ctx init_ctx;
+    init_constant (&init_ctx       );
+    init_key      (&init_ctx, key  );
+    init_big_nonce(&init_ctx, nonce);
+
+    // set up the cascade
+    init_constant(ctx            );
+    init_ctr     (ctx, ctr       );
+    init_nonce   (ctx, nonce + 16);
+    half_chacha20_rounds(ctx->input + 5, &init_ctx); // init derived key
+}
+
+static void
+encrypt_chacha20(crypto_chacha_ctx *ctx,
+                 const uint8_t     *plain_text,
+                 uint8_t           *cipher_text,
+                 size_t             msg_length)
+{
+    size_t remaining_bytes = msg_length;
+    for (;;) {
+        uint8_t random_block[64];
+        chacha20_rounds(random_block, ctx);
+        increment_counter(ctx); // the only modification of the context
+
+        // XOR the last pseudo-random block with the input,
+        // then end the loop.
+        if (remaining_bytes <= 64) {
+            for (unsigned i = 0; i < remaining_bytes; i++)
+                cipher_text[i] = plain_text[i] ^ random_block[i];
+            return;
+        }
+
+        // XOR the current pseudo-random block with the input.
+        for (int i = 0; i < 64; i++)
+            cipher_text[i] = plain_text[i] ^ random_block[i];
+        remaining_bytes -= 64;
+        cipher_text     += 64;
+        plain_text      += 64;
+    }
+}
+
+//////////////////
+/// Encryption ///
+//////////////////
+void
+crypto_encrypt_chacha20(const uint8_t  key[32],
+                        const uint8_t  nonce[8],
+                        uint64_t       ctr,
+                        const uint8_t *plain_text,
+                        uint8_t       *cipher_text,
+                        size_t         msg_length)
+{
+    crypto_chacha_ctx ctx;
+    init_chacha20(&ctx, key, nonce, ctr);
+    encrypt_chacha20(&ctx, plain_text, cipher_text, msg_length);
+}
+
+
+void
+crypto_encrypt_Xchacha20(const uint8_t  key[32],
+                         const uint8_t  nonce[24],
+                         uint64_t       ctr,
+                         const uint8_t *plain_text,
+                         uint8_t       *cipher_text,
+                         size_t         msg_length)
+{
+    crypto_chacha_ctx ctx;
+    init_Xchacha20(&ctx, key, nonce, ctr);
+    encrypt_chacha20(&ctx, plain_text, cipher_text, msg_length);
+}
+
+void
+crypto_block_chacha20(const uint8_t key[32],
+                      const uint8_t nonce[8],
+                      uint64_t      ctr,
+                      uint8_t       output[64])
+{
+    crypto_chacha_ctx ctx;
+    init_chacha20(&ctx, key, nonce, ctr);
+    chacha20_rounds(output, &ctx);
+}
+
+void
+crypto_block_Xchacha20(const uint8_t key[32],
+                       const uint8_t nonce[24],
+                       uint64_t      ctr,
+                       uint8_t       output[64])
+{
+    crypto_chacha_ctx ctx;
+    init_Xchacha20(&ctx, key, nonce, ctr);
+    chacha20_rounds(output, &ctx);
+}
+
+///////////////////////////////
+/// Random number generator ///
+///////////////////////////////
+void
+crypto_init_rng(crypto_rng_context *ctx,
+                const uint8_t       key[32],
+                const uint8_t       nonce[8])
+{
+    init_chacha20(&ctx->chacha_ctx, key, nonce, 0);
+    ctx->remaining_bytes = 0;
+}
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+void
+crypto_random_bytes(crypto_rng_context *ctx,
+                    uint8_t            *out,
+                    size_t              nb_bytes)
+{
+    // Consume any remaining byte from a previous
+    // call to random_bytes
+    size_t penultimate_size = MIN(nb_bytes, ctx->remaining_bytes);
+    memcpy(out, ctx->reminder + 64 - ctx->remaining_bytes, penultimate_size);
+    out                  += penultimate_size;
+    nb_bytes             -= penultimate_size;
+    ctx->remaining_bytes -= penultimate_size;
+
+    if (nb_bytes == 0) { return ;} // past that point, ctx->remaining_bytes is 0
+
+    // fill the output stream block by block
+    while (nb_bytes >= 64) {
+        chacha20_rounds(out, &ctx->chacha_ctx);
+        increment_counter(&ctx->chacha_ctx);
+        out      += 64;
+        nb_bytes -= 64;
+    }
+
+    // Generate one last block and finish this
+    chacha20_rounds(ctx->reminder, &ctx->chacha_ctx); // there was no reminder
+    increment_counter(&ctx->chacha_ctx);
+    memcpy(out, ctx->reminder, nb_bytes); // those two lines work even
+    ctx->remaining_bytes = 64 - nb_bytes; // when nb_bytes is already 0
+}
diff --git a/chacha20.h b/chacha20.h
new file mode 100644
index 0000000..4bd598f
--- /dev/null
+++ b/chacha20.h
@@ -0,0 +1,92 @@
+#ifndef __CHACHA20__
+#define __CHACHA20__
+
+#include <inttypes.h>
+#include <stddef.h>
+
+typedef struct crypto_chacha_ctx {
+    uint32_t input[16];
+} crypto_chacha_ctx;
+
+typedef struct crypto_rng_context {
+    crypto_chacha_ctx chacha_ctx;
+    uint8_t           reminder[64];
+    size_t            remaining_bytes;
+} crypto_rng_context;
+
+// Encrypts the plain_text by XORing it with a pseudo-random
+// stream of numbers, seeded by the key and the nonce.
+// NEVER ENCRYPT 2 DISTINCT MESSAGES WITH THE SAME KEY AND NONCE.
+// If you do, you will expose the XOR of the two messages, and your
+// confidentiality is toast.  One time pads are called one time pads
+// for a reason.
+//
+// To decrypt a message, encrypt it again with the same key and nonce.
+//
+// key        : Your secret key.  Chose a (pseudo-)random one!
+// nonce      : Only once per key. Seriously, don't mess this up.
+// ctr        : Initial counter value. Typically 0 or 1
+// plain_text : Your precious secret message
+// cipher_text: Output buffer
+// msg_length : Length of plain_text and cipher_text
+void
+crypto_encrypt_chacha20(const uint8_t  key[32],
+                        const uint8_t  nonce[8],
+                        uint64_t       ctr,
+                        const uint8_t *plain_text,
+                        uint8_t       *cipher_text,
+                        size_t         msg_length);
+
+// This one is very similar to encrypt_chacha20, except it provides
+// a nonce large enough to chose at random, withouth worrying about
+// collisions.  Handy for stateless protocols.
+//
+// Streaming performance is the same, initialization is a bit slower.
+void
+crypto_encrypt_Xchacha20(const uint8_t  key[32],
+                         const uint8_t  nonce[24],
+                         uint64_t       ctr,
+                         const uint8_t *plain_text,
+                         uint8_t       *cipher_text,
+                         size_t         msg_length);
+
+// Will look random as long as you never use the same input twice.
+// Can be used with encrypt_bytes as long as the counter here is smaller
+// than the encrypt_bytes counter, and the message is not long enough to
+// wrap around the counter.
+// Use 1 for the encrypt_bytes counter and 0 here, and you should be safe.
+void
+crypto_block_chacha20(const uint8_t key[32],
+                      const uint8_t nonce[8],
+                      uint64_t      ctr,
+                      uint8_t       output[64]);
+
+// Similar to block_chacha20, except for the big nonce and worse performance.
+void
+crypto_block_Xchacha20(const uint8_t key[32],
+                       const uint8_t nonce[24],
+                       uint64_t      ctr,
+                       uint8_t       output[64]);
+
+
+
+// Inits a cryptographically secure Random Number Generator, with the
+// given key and nonce.  The output of that RNG will depend entirely
+// on the key and nonce.
+// NEVER USE THE SAME KEY AND NONCE FOR THIS AND MESSAGE ENCRYPTION.
+// If you do, you could leak the very key stream used to encrypt
+// your messages.  They'd be instantly deciphered.
+void
+crypto_init_rng(crypto_rng_context *ctx,
+                const uint8_t       key[32],
+                const uint8_t       nonce[8]);
+
+// provides pseudo-random bytes, deterministically (the output and
+// the end state of ctx depends entirely on the initial state of ctx).
+// It's a chacha20 stream, really.
+void
+crypto_random_bytes(crypto_rng_context *ctx,
+                    uint8_t            *out,
+                    size_t              nb_bytes);
+
+#endif // __CHACHA20__
-- 
2.47.3