/// Round functions ///
///////////////////////
-// A quarter round is meant to mangle a fourth of a chacha context.
-// (a line, a column, or any subset you can think of). Enough of
-// these rounds, carefully chosen, will garble the input beyond
-// recognition.
+
+// Mangles the chacha context into a random-looking output.
//
// WARNING: THIS OPERATION IS REVERSIBLE.
-//
-// If you build a crypto primitive on top of it without additional
-// precautions, any rookie cryptanalist can break it.
-#define QUARTERROUND(a, b, c, d) \
- a = a + b; d = rotl32(d ^ a, 16); \
- c = c + d; b = rotl32(b ^ c, 12); \
- a = a + b; d = rotl32(d ^ a, 8); \
- c = c + d; b = rotl32(b ^ c, 7)
-
-// Mangles the chacha context into a pseudorandom output
-// That is, if you don't know the key stored in the context,
-// you can't deduce squat about the output. This is true
-// even if you know the output of many other contexes, even
-// if they differ from this one by only one bit.
+// We still need to take steps to ensure the attacker can't
+// deduce the input (and with it, the key) from the output.
static void
-chacha20_rounds(uint8_t output[64], const crypto_chacha_ctx *ctx)
+chacha20_rounds(uint32_t output[16], const crypto_chacha_ctx *ctx)
{
// Local variables instead of indices, to facilitate optimisations
// TODO: test this shit. The speed increase might be small.
// 8 9 10 11
// 12 13 14 15
+ // A quarter round is meant to mangle a fourth of a chacha context.
+ // (a line, a column, or any subset you can think of). Enough of
+ // these rounds, carefully chosen, will garble the input beyond
+ // recognition.
+ //
+ // WARNING: THIS OPERATION IS REVERSIBLE.
+ //
+// If you build a crypto primitive on top of it without additional
+// precautions, any rookie cryptanalyst can break it.
+#define QUARTERROUND(a, b, c, d) \
+ a = a + b; d = rotl32(d ^ a, 16); \
+ c = c + d; b = rotl32(b ^ c, 12); \
+ a = a + b; d = rotl32(d ^ a, 8); \
+ c = c + d; b = rotl32(b ^ c, 7)
+
// Column round. Very SIMD friendly, if you want high performance.
QUARTERROUND(x0, x4, x8, x12); // column 0
QUARTERROUND(x1, x5, x9, x13); // column 1
QUARTERROUND(x3, x4, x9, x14); // diagonal 4
}
+ output[ 0] = x0;
+ output[ 1] = x1;
+ output[ 2] = x2;
+ output[ 3] = x3;
+ output[ 4] = x4;
+ output[ 5] = x5;
+ output[ 6] = x6;
+ output[ 7] = x7;
+ output[ 8] = x8;
+ output[ 9] = x9;
+ output[10] = x10;
+ output[11] = x11;
+ output[12] = x12;
+ output[13] = x13;
+ output[14] = x14;
+ output[15] = x15;
+}
+
+// Computes one 64-byte block of ChaCha20 keystream from the context.
+//
+// The feed-forward addition of ctx->input below is what makes this
+// one-way: the rounds alone are reversible (see chacha20_rounds).
+static void
+chacha20_block(uint8_t output[64], const crypto_chacha_ctx *ctx)
+{
+    uint32_t buffer[16];
+    chacha20_rounds(buffer, ctx);
+
	// Now our buffer is seriously garbled. However, it is still easy
	// to deduce the initial context from it: just invert the quarter
	// rounds and apply that in reverse order.
	// the performance of naive implementations such as this one. With
	// SIMD, it's faster to just add the lot, so that's what the standard
	// does.
-	x0  += ctx->input[ 0];
-	x1  += ctx->input[ 1];
-	x2  += ctx->input[ 2];
-	x3  += ctx->input[ 3];
-	x4  += ctx->input[ 4];
-	x5  += ctx->input[ 5];
-	x6  += ctx->input[ 6];
-	x7  += ctx->input[ 7];
-	x8  += ctx->input[ 8];
-	x9  += ctx->input[ 9];
-	x10 += ctx->input[10];
-	x11 += ctx->input[11];
-	x12 += ctx->input[12];
-	x13 += ctx->input[13];
-	x14 += ctx->input[14];
-	x15 += ctx->input[15];
-
-	// finally, we can output our buffer
-	store32_le(output +  0, x0 );
-	store32_le(output +  4, x1 );
-	store32_le(output +  8, x2 );
-	store32_le(output + 12, x3 );
-	store32_le(output + 16, x4 );
-	store32_le(output + 20, x5 );
-	store32_le(output + 24, x6 );
-	store32_le(output + 28, x7 );
-	store32_le(output + 32, x8 );
-	store32_le(output + 36, x9 );
-	store32_le(output + 40, x10);
-	store32_le(output + 44, x11);
-	store32_le(output + 48, x12);
-	store32_le(output + 52, x13);
-	store32_le(output + 56, x14);
-	store32_le(output + 60, x15);
+    // Feed-forward, then serialise each word little-endian.
+    for (unsigned i = 0; i < 16; i++) {
+        uint32_t sum = buffer[i] + ctx->input[i];
+        store32_le(output + i*4, sum);
+    }
}
-// This one is the same as chacha20_rounds, only it gives you only
+// This one is the same as chacha20_block, only it gives you only
-// half the output (256 bytes). It's basically the same as HSalsa20,
-// except build on ChaCha. It is provably as secure as ChaCha20
+// half the output (32 bytes). It's basically the same as HSalsa20,
+// except built on ChaCha. It is provably as secure as ChaCha20.
static void
-half_chacha20_rounds(uint32_t output[8], const crypto_chacha_ctx *ctx)
+half_chacha20_block(uint32_t output[8], const crypto_chacha_ctx *ctx)
{
-	// Copy pasta rom chacha20_rounds
-	uint32_t x0  = ctx->input[ 0];
-	uint32_t x1  = ctx->input[ 1];
-	uint32_t x2  = ctx->input[ 2];
-	uint32_t x3  = ctx->input[ 3];
-	uint32_t x4  = ctx->input[ 4];
-	uint32_t x5  = ctx->input[ 5];
-	uint32_t x6  = ctx->input[ 6];
-	uint32_t x7  = ctx->input[ 7];
-	uint32_t x8  = ctx->input[ 8];
-	uint32_t x9  = ctx->input[ 9];
-	uint32_t x10 = ctx->input[10];
-	uint32_t x11 = ctx->input[11];
-	uint32_t x12 = ctx->input[12];
-	uint32_t x13 = ctx->input[13];
-	uint32_t x14 = ctx->input[14];
-	uint32_t x15 = ctx->input[15];
-
-	// Copy pasta rom chacha20_rounds
-	for (int i = 20; i > 0; i -= 2) {
-		QUARTERROUND(x0, x4, x8 , x12); // column 0
-		QUARTERROUND(x1, x5, x9 , x13); // column 1
-		QUARTERROUND(x2, x6, x10, x14); // column 2
-		QUARTERROUND(x3, x7, x11, x15); // column 3
-		QUARTERROUND(x0, x5, x10, x15); // diagonal 1
-		QUARTERROUND(x1, x6, x11, x12); // diagonal 2
-		QUARTERROUND(x2, x7, x8 , x13); // diagonal 3
-		QUARTERROUND(x3, x4, x9 , x14); // diagonal 4
-	}
+    uint32_t buffer[16];
+    chacha20_rounds(buffer, ctx);
	// Okay, remember about needing that addition? Well, we only
	// Disclose half of the output, and that ensures the attacker
	//
	// This lets us avoid a couple additional loads and additions,
	// for even moar speed.
-	output[0] = ctx->input[ 0]; // don't add the constant
-	output[1] = ctx->input[ 1]; // don't add the constant
-	output[2] = ctx->input[ 2]; // don't add the constant
-	output[3] = ctx->input[ 3]; // don't add the constant
-	output[4] = ctx->input[12]; // don't add the counter
-	output[5] = ctx->input[13]; // don't add the counter
-	output[6] = ctx->input[14]; // don't add the nonce
-	output[7] = ctx->input[15]; // don't add the nonce
+    // HChaCha20 output: post-rounds state words 0-3 go in output[0..3],
+    // words 12-15 go in output[4..7].  No feed-forward addition.
+    memcpy(output    , buffer     , sizeof(uint32_t) * 4);
+    memcpy(output + 4, buffer + 12, sizeof(uint32_t) * 4);
}
//////////////////////////////
init_constant(ctx );
init_ctr (ctx, ctr );
init_nonce (ctx, nonce + 16);
- half_chacha20_rounds(ctx->input + 5, &init_ctx); // init derived key
+ half_chacha20_block(ctx->input + 5, &init_ctx); // init derived key
}
static void
size_t remaining_bytes = msg_length;
for (;;) {
uint8_t random_block[64];
- chacha20_rounds(random_block, ctx);
+ chacha20_block(random_block, ctx);
increment_counter(ctx); // the only modification of the context
// XOR the last pseudo-random block with the input,
{
crypto_chacha_ctx ctx;
init_chacha20(&ctx, key, nonce, ctr);
- chacha20_rounds(output, &ctx);
+ chacha20_block(output, &ctx);
}
void
{
crypto_chacha_ctx ctx;
init_Xchacha20(&ctx, key, nonce, ctr);
- chacha20_rounds(output, &ctx);
+ chacha20_block(output, &ctx);
}
///////////////////////////////
// fill the output stream block by block
while (nb_bytes >= 64) {
- chacha20_rounds(out, &ctx->chacha_ctx);
+ chacha20_block(out, &ctx->chacha_ctx);
increment_counter(&ctx->chacha_ctx);
out += 64;
nb_bytes -= 64;
}
// Generate one last block and finish this
- chacha20_rounds(ctx->reminder, &ctx->chacha_ctx); // there was no reminder
+ chacha20_block(ctx->reminder, &ctx->chacha_ctx); // there was no reminder
increment_counter(&ctx->chacha_ctx);
memcpy(out, ctx->reminder, nb_bytes); // those two lines work even
ctx->remaining_bytes = 64 - nb_bytes; // when nb_bytes is already 0