From cb5d6013310abf64bce908779ebd6d92fcb84136 Mon Sep 17 00:00:00 2001 From: Loup Vaillant Date: Sun, 4 Sep 2016 01:48:25 +0200 Subject: [PATCH] Added chacha20 related algorithms --- .gitignore | 3 + chacha20.c | 465 +++++++++++++++++++++++++++++++++++++++++++++++++++++ chacha20.h | 92 +++++++++++ 3 files changed, 560 insertions(+) create mode 100644 .gitignore create mode 100644 chacha20.c create mode 100644 chacha20.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62fb2d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# backup files +*~ +*.o \ No newline at end of file diff --git a/chacha20.c b/chacha20.c new file mode 100644 index 0000000..41b14f2 --- /dev/null +++ b/chacha20.c @@ -0,0 +1,465 @@ +#include "chacha20.h" +#include + +///////////////// +/// Utilities /// +///////////////// + +// undefined if n >= 32 or n == 0 +// Should compile into a single rot32 instruction +// if available on the processsor. +static uint32_t +rotl32 (uint32_t x, uint32_t n) +{ + return (x << n) | (x >> (32 - n)); +} + +static uint32_t +load32_le(const uint8_t s[4]) +{ + // Portable, slow way. + // Only affects initialisation, though. + return s[0] + | (s[1] << 8) + | (s[2] << 16) + | (s[3] << 24); +} + +static void +store32_le(uint8_t output[4], uint32_t input) +{ + // Portable, slow way. + output[0] = input & 0xff; + output[1] = (input >> 8) & 0xff; + output[2] = (input >> 16) & 0xff; + output[3] = (input >> 24) & 0xff; +} + +// The counter is basically unlimited. Still, if you ever allow +// it to wrap around itself (by encrypting more than 2^70 bytes +// per nonce), you'll expose the XOR of the begining and the end +// of your plaintext, and lose confidentiality in the process. +// Oops. +static void +increment_counter(crypto_chacha_ctx *ctx) +{ + ctx->input[12]++; + // That conditional ultimately only depends on the length of the input, + // which is not secret. Heck, the counter itself is not secret. + // No timing attack there, we're fine. + if (!ctx->input[12]) + ctx->input[13]++; +} + +/////////////////////// +/// Round functions /// +/////////////////////// + +// A quarter round is meant to mangle a fourth of a chacha context. +// (a line, a column, or any subset you can think of). Enough of +// these rounds, carefully chosen, will garble the input beyond +// recognition. +// +// WARNING: THIS OPERATION IS REVERSIBLE. +// +// If you build a crypto primitive on top of it without additional +// precautions, any rookie cryptanalist can break it. +#define QUARTERROUND(a, b, c, d) \ + a = a + b; d = rotl32(d ^ a, 16); \ + c = c + d; b = rotl32(b ^ c, 12); \ + a = a + b; d = rotl32(d ^ a, 8); \ + c = c + d; b = rotl32(b ^ c, 7) + +// Mangles the chacha context into a pseudorandom output +// That is, if you don't know the key stored in the context, +// you can't deduce squat about the output. This is true +// even if you know the output of many other contexes, even +// if they differ from this one by only one bit. +static void +chacha20_rounds(uint8_t output[64], const crypto_chacha_ctx *ctx) +{ + // Local variables instead of indices, to facilitate optimisations + // TODO: test this shit. The speed increase might be small. + uint32_t x0 = ctx->input[ 0]; + uint32_t x1 = ctx->input[ 1]; + uint32_t x2 = ctx->input[ 2]; + uint32_t x3 = ctx->input[ 3]; + uint32_t x4 = ctx->input[ 4]; + uint32_t x5 = ctx->input[ 5]; + uint32_t x6 = ctx->input[ 6]; + uint32_t x7 = ctx->input[ 7]; + uint32_t x8 = ctx->input[ 8]; + uint32_t x9 = ctx->input[ 9]; + uint32_t x10 = ctx->input[10]; + uint32_t x11 = ctx->input[11]; + uint32_t x12 = ctx->input[12]; + uint32_t x13 = ctx->input[13]; + uint32_t x14 = ctx->input[14]; + uint32_t x15 = ctx->input[15]; + + for (int i = 20; i > 0; i -= 2) { // 20 rounds, 2 rounds per loop. + // The ctx is viewed as a matrix, whose indices are like + // 0 1 2 3 + // 4 5 6 7 + // 8 9 10 11 + // 12 13 14 15 + + // Column round. Very SIMD friendly, if you want high performance. + QUARTERROUND(x0, x4, x8, x12); // column 0 + QUARTERROUND(x1, x5, x9, x13); // column 1 + QUARTERROUND(x2, x6, x10, x14); // column 2 + QUARTERROUND(x3, x7, x11, x15); // column 3 + + // South East diagonal round. + // With a bit of permutation, this is also very SIMD friendly. + QUARTERROUND(x0, x5, x10, x15); // diagonal 1 + QUARTERROUND(x1, x6, x11, x12); // diagonal 2 + QUARTERROUND(x2, x7, x8, x13); // diagonal 3 + QUARTERROUND(x3, x4, x9, x14); // diagonal 4 + } + + // Now our buffer is seriously garbled. However, it is still easy + // to deduce the initial context from it: just invert the quarter + // rounds and apply that in reverse order. + // + // So we perform a final operation: we add the initial context to + // the buffer. Half of this initial context is made up of the key, + // unknown to the attacker (the constant, nonce and counter are + // known however). + // + // This effectively removes half the intel the attacker needs to + // reverse the computation, forcing him to try all 2^256 possibilities. + // Well that's the idea anyway. the security of this trick is not + // proven, and with few enough rounds, there are clever schemes that + // don't try the whole key space. + // + // As of 2016, "low enough" means 6 or 7 rounds. We use 20. + // This shoul be enough to prevent anyone from breaking them all + // in the forseeable future. + // + // Note that in principle, we don't have to add the constants nonce + // and counter: that part could be reversed by the attacker anyway + // so we'd be just as secure if we didn't. However that only improves + // the performance of naive implementations such as this one. With + // SIMD, it's faster to just add the lot, so that's what the standard + // does. + x0 += ctx->input[ 0]; + x1 += ctx->input[ 1]; + x2 += ctx->input[ 2]; + x3 += ctx->input[ 3]; + x4 += ctx->input[ 4]; + x5 += ctx->input[ 5]; + x6 += ctx->input[ 6]; + x7 += ctx->input[ 7]; + x8 += ctx->input[ 8]; + x9 += ctx->input[ 9]; + x10 += ctx->input[10]; + x11 += ctx->input[11]; + x12 += ctx->input[12]; + x13 += ctx->input[13]; + x14 += ctx->input[14]; + x15 += ctx->input[15]; + + // finally, we can output our buffer + store32_le(output + 0, x0 ); + store32_le(output + 4, x1 ); + store32_le(output + 8, x2 ); + store32_le(output + 12, x3 ); + store32_le(output + 16, x4 ); + store32_le(output + 20, x5 ); + store32_le(output + 24, x6 ); + store32_le(output + 28, x7 ); + store32_le(output + 32, x8 ); + store32_le(output + 36, x9 ); + store32_le(output + 40, x10); + store32_le(output + 44, x11); + store32_le(output + 48, x12); + store32_le(output + 52, x13); + store32_le(output + 56, x14); + store32_le(output + 60, x15); +} + +// This one is the same as chacha20_rounds, only it gives you only +// half the output (256 bytes). It's basically the same as HSalsa20, +// except build on ChaCha. It is provably as secure as ChaCha20 +static void +half_chacha20_rounds(uint32_t output[8], const crypto_chacha_ctx *ctx) +{ + // Copy pasta rom chacha20_rounds + uint32_t x0 = ctx->input[ 0]; + uint32_t x1 = ctx->input[ 1]; + uint32_t x2 = ctx->input[ 2]; + uint32_t x3 = ctx->input[ 3]; + uint32_t x4 = ctx->input[ 4]; + uint32_t x5 = ctx->input[ 5]; + uint32_t x6 = ctx->input[ 6]; + uint32_t x7 = ctx->input[ 7]; + uint32_t x8 = ctx->input[ 8]; + uint32_t x9 = ctx->input[ 9]; + uint32_t x10 = ctx->input[10]; + uint32_t x11 = ctx->input[11]; + uint32_t x12 = ctx->input[12]; + uint32_t x13 = ctx->input[13]; + uint32_t x14 = ctx->input[14]; + uint32_t x15 = ctx->input[15]; + + // Copy pasta rom chacha20_rounds + for (int i = 20; i > 0; i -= 2) { + QUARTERROUND(x0, x4, x8, x12); // column 0 + QUARTERROUND(x1, x5, x9, x13); // column 1 + QUARTERROUND(x2, x6, x10, x14); // column 2 + QUARTERROUND(x3, x7, x11, x15); // column 3 + QUARTERROUND(x0, x5, x10, x15); // diagonal 1 + QUARTERROUND(x1, x6, x11, x12); // diagonal 2 + QUARTERROUND(x2, x7, x8, x13); // diagonal 3 + QUARTERROUND(x3, x4, x9, x14); // diagonal 4 + } + + // Okay, remember about needing that addition? Well, we only + // Disclose half of the output, and that ensures the attacker + // is missing 256 bits of information just like in regular chacha20. + // + // However, to be able to *prove* this is just as secure as chacha20, + // we will copy the numbers we know the attacker could have deduced + // anyway. That is: + // - words 0, 1, 2, 3 (chacha constant), + // - words 12 and 13 (counter), + // - words 14 and 15 (nonce) + // + // This lets us avoid a couple additional loads and additions, + // for even moar speed. + output[0] = ctx->input[ 0]; // don't add the constant + output[1] = ctx->input[ 1]; // don't add the constant + output[2] = ctx->input[ 2]; // don't add the constant + output[3] = ctx->input[ 3]; // don't add the constant + output[4] = ctx->input[12]; // don't add the counter + output[5] = ctx->input[13]; // don't add the counter + output[6] = ctx->input[14]; // don't add the nonce + output[7] = ctx->input[15]; // don't add the nonce +} + +////////////////////////////// +/// Context initialization /// +////////////////////////////// +static void +init_constant(crypto_chacha_ctx *ctx) +{ + // This constant looks like wasted space, that could be used + // for a larger key, nounce, or counter. But while its exact + // value hardly matters, some properties it has do. + // + // Among other things, this constant prevents the existence of + // the all zero context, which would map to an all zero output; + // and is "assymetric" enough to guarantee good mangling. + // + // Also, the very exstence of a constant reduces reduces the amount + // of the context that's under the control of the attacker (a fourth + // instead of a whole half). + // + // Simply put: keep the constant, it's safer that way. + ctx->input[0] = load32_le((uint8_t*)"expa"); + ctx->input[1] = load32_le((uint8_t*)"nd 3"); + ctx->input[2] = load32_le((uint8_t*)"2-by"); + ctx->input[3] = load32_le((uint8_t*)"te k"); +} + +static void +init_key(crypto_chacha_ctx *ctx, const uint8_t key[32]) +{ + ctx->input[4] = load32_le(key + 0); + ctx->input[5] = load32_le(key + 4); + ctx->input[6] = load32_le(key + 8); + ctx->input[7] = load32_le(key + 12); + ctx->input[8] = load32_le(key + 16); + ctx->input[9] = load32_le(key + 20); + ctx->input[10] = load32_le(key + 24); + ctx->input[11] = load32_le(key + 28); +} + +static void +init_ctr(crypto_chacha_ctx *ctx, uint64_t ctr) +{ + ctx->input[12] = (uint32_t) ctr; // LSB + ctx->input[13] = (uint32_t) (ctr >> (uint64_t)32); // MSB +} + +static void +init_nonce(crypto_chacha_ctx *ctx, const uint8_t nonce[8]) +{ + ctx->input[14] = load32_le(nonce + 0); + ctx->input[15] = load32_le(nonce + 4); +} + +static void +init_big_nonce(crypto_chacha_ctx *ctx, const uint8_t nonce[16]) +{ + ctx->input[12] = load32_le(nonce + 0); + ctx->input[13] = load32_le(nonce + 4); + ctx->input[14] = load32_le(nonce + 8); + ctx->input[15] = load32_le(nonce + 12); +} + +static void +init_chacha20(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr) +{ + init_constant(ctx ); + init_key (ctx, key ); + init_ctr (ctx, ctr ); + init_nonce (ctx, nonce); +} + +// Initializes a chacha context, with a bigger nonce. +// +// It uses a cascade scheme where a first block is initialised with +// the first 128 bits of the nounce (and no counter), and a second block +// is initialised with a derived key from the first block, and the +// last 64 bits of the nonce. +// +// It's slower than regular initialization, but that big nonce can now +// be selected at random without fear of collision. +static void +init_Xchacha20(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr) +{ + // initialise a first block + crypto_chacha_ctx init_ctx; + init_constant (&init_ctx ); + init_key (&init_ctx, key ); + init_big_nonce(&init_ctx, nonce); + + // set up the cascade + init_constant(ctx ); + init_ctr (ctx, ctr ); + init_nonce (ctx, nonce + 16); + half_chacha20_rounds(ctx->input + 5, &init_ctx); // init derived key +} + +static void +encrypt_chacha20(crypto_chacha_ctx *ctx, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t msg_length) +{ + size_t remaining_bytes = msg_length; + for (;;) { + uint8_t random_block[64]; + chacha20_rounds(random_block, ctx); + increment_counter(ctx); // the only modification of the context + + // XOR the last pseudo-random block with the input, + // then end the loop. + if (remaining_bytes <= 64) { + for (unsigned i = 0; i < remaining_bytes; i++) + cipher_text[i] = plain_text[i] ^ random_block[i]; + return; + } + + // XOR the current pseudo-random block with the input. + for (int i = 0; i < 64; i++) + cipher_text[i] = plain_text[i] ^ random_block[i]; + remaining_bytes -= 64; + cipher_text += 64; + plain_text += 64; + } +} + +////////////////// +/// Encryption /// +////////////////// +void +crypto_encrypt_chacha20(const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t msg_length) +{ + crypto_chacha_ctx ctx; + init_chacha20(&ctx, key, nonce, ctr); + encrypt_chacha20(&ctx, plain_text, cipher_text, msg_length); +} + + +void +crypto_encrypt_Xchacha20(const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t msg_length) +{ + crypto_chacha_ctx ctx; + init_Xchacha20(&ctx, key, nonce, ctr); + encrypt_chacha20(&ctx, plain_text, cipher_text, msg_length); +} + +void +crypto_block_chacha20(const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr, + uint8_t output[64]) +{ + crypto_chacha_ctx ctx; + init_chacha20(&ctx, key, nonce, ctr); + chacha20_rounds(output, &ctx); +} + +void +crypto_block_Xchacha20(const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr, + uint8_t output[64]) +{ + crypto_chacha_ctx ctx; + init_Xchacha20(&ctx, key, nonce, ctr); + chacha20_rounds(output, &ctx); +} + +/////////////////////////////// +/// Random number generator /// +/////////////////////////////// +void +crypto_init_rng(crypto_rng_context *ctx, + const uint8_t key[32], + const uint8_t nonce[8]) +{ + init_chacha20(&ctx->chacha_ctx, key, nonce, 0); + ctx->remaining_bytes = 0; +} + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +void +crypto_random_bytes(crypto_rng_context *ctx, + uint8_t *out, + size_t nb_bytes) +{ + // Consume any remaining byte from a previous + // call to random_bytes + size_t penultimate_size = MIN(nb_bytes, ctx->remaining_bytes); + memcpy(out, ctx->reminder + 64 - ctx->remaining_bytes, penultimate_size); + out += penultimate_size; + nb_bytes -= penultimate_size; + ctx->remaining_bytes -= penultimate_size; + + if (nb_bytes == 0) { return ;} // past that point, ctx->remaining_bytes is 0 + + // fill the output stream block by block + while (nb_bytes >= 64) { + chacha20_rounds(out, &ctx->chacha_ctx); + increment_counter(&ctx->chacha_ctx); + out += 64; + nb_bytes -= 64; + } + + // Generate one last block and finish this + chacha20_rounds(ctx->reminder, &ctx->chacha_ctx); // there was no reminder + increment_counter(&ctx->chacha_ctx); + memcpy(out, ctx->reminder, nb_bytes); // those two lines work even + ctx->remaining_bytes = 64 - nb_bytes; // when nb_bytes is already 0 +} diff --git a/chacha20.h b/chacha20.h new file mode 100644 index 0000000..4bd598f --- /dev/null +++ b/chacha20.h @@ -0,0 +1,92 @@ +#ifndef __CHACHA20__ +#define __CHACHA20__ + +#include +#include + +typedef struct crypto_chacha_ctx { + uint32_t input[16]; +} crypto_chacha_ctx; + +typedef struct crypto_rng_context { + crypto_chacha_ctx chacha_ctx; + uint8_t reminder[64]; + size_t remaining_bytes; +} crypto_rng_context; + +// Encrypts the plain_text by XORing it with a pseudo-random +// stream of numbers, seeded by the key and the nonce. +// NEVER ENCRYPT 2 DISTINCT MESSAGES WITH THE SAME KEY AND NONCE. +// If you do, you will expose the XOR of the two messages, and your +// confidentiality is toast. One time pads are called one time pads +// for a reason. +// +// To decrypt a message, encrypt it again with the same key and nonce. +// +// key : Your secret key. Chose a (pseudo-)random one! +// nonce : Only once per key. Seriously, don't mess this up. +// ctr : Initial counter value. Typically 0 or 1 +// plain_text : Your precious secret message +// cipher_text: Output buffer +// msg_length : Length of plain_text and cipher_text +void +crypto_encrypt_chacha20(const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t msg_length); + +// This one is very similar to encrypt_chacha20, except it provides +// a nonce large enough to chose at random, withouth worrying about +// collisions. Handy for stateless protocols. +// +// Streaming performance is the same, initialization is a bit slower. +void +crypto_encrypt_Xchacha20(const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t msg_length); + +// Will look random as long as you never use the same input twice. +// Can be used with encrypt_bytes as long as the counter here is smaller +// than the encrypt_bytes counter, and the message is not long enough to +// wrap around the counter. +// Use 1 for the encrypt_bytes counter and 0 here, and you should be safe. +void +crypto_block_chacha20(const uint8_t key[32], + const uint8_t nonce[8], + uint64_t ctr, + uint8_t output[64]); + +// Similar to block_chacha20, except for the big nonce and worse performance. +void +crypto_block_Xchacha20(const uint8_t key[32], + const uint8_t nonce[24], + uint64_t ctr, + uint8_t output[64]); + + + +// Inits a cryptographically secure Random Number Generator, with the +// given key and nonce. The output of that RNG will depend entirely +// on the key and nonce. +// NEVER USE THE SAME KEY AND NONCE FOR THIS AND MESSAGE ENCRYPTION. +// If you do, you could leak the very key stream used to encrypt +// your messages. They'd be instantly deciphered. +void +crypto_init_rng(crypto_rng_context *ctx, + const uint8_t key[32], + const uint8_t nonce[8]); + +// provides pseudo-random bytes, deterministically (the output and +// the end state of ctx depends entirely on the initial state of ctx). +// It's a chacha20 stream, really. +void +crypto_random_bytes(crypto_rng_context *ctx, + uint8_t *out, + size_t nb_bytes); + +#endif // __CHACHA20__ -- 2.47.3