From: Loup Vaillant Date: Wed, 22 Mar 2023 22:49:16 +0000 (+0100) Subject: Modify Blake2b context input to byte buffer X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=310aab8ddf9b0a31ccc74c06ece8098769cf1231;p=Monocypher.git Modify Blake2b context input to byte buffer Though it requires a (safe because it's all aligned) cast at one point, it makes the code simpler and significantly speeds up non-aligned incremental hashes. Surprisingly, forgoing word-by-word loading at the beginning of the update doesn't slow anything down, but forgoing it at the end *does*. So while we align with block boundaries directly, we end up copying the remaining words first, then the remaining bytes. --- diff --git a/src/monocypher.c b/src/monocypher.c index 17a7046..0257fb5 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -489,7 +489,7 @@ static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block) u64 v7 = ctx->hash[7]; u64 v15 = iv[7]; // mangle work vector - u64 *input = ctx->input; + u64 *input = (u64*)ctx->input; #define BLAKE2_G(a, b, c, d, x, y) \ a += b + x; d = rotr64(d ^ a, 32); \ c += d; b = rotr64(b ^ c, 24); \ @@ -533,14 +533,11 @@ void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size, ctx->input_offset[1] = 0; // beginning of the input, no offset ctx->hash_size = hash_size; ctx->input_idx = 0; - ZERO(ctx->input, 16); + ZERO((u64*)ctx->input, 16); // if there is a key, the first block is that key (padded with zeroes) if (key_size > 0) { - u8 key_block[128] = {0}; - COPY(key_block, key, key_size); - // same as calling crypto_blake2b_update(ctx, key_block , 128) - load64_le_buf(ctx->input, key_block, 16); + COPY(ctx->input, key, key_size); ctx->input_idx = 128; } } @@ -558,35 +555,22 @@ void crypto_blake2b_update(crypto_blake2b_ctx *ctx, return; } - // Align with word boundaries - if ((ctx->input_idx & 7) != 0) { - size_t nb_bytes = MIN(gap(ctx->input_idx, 8), message_size); - size_t word = ctx->input_idx >> 3; - size_t byte = 
ctx->input_idx & 7; - FOR (i, 0, nb_bytes) { - ctx->input[word] |= (u64)message[i] << ((byte + i) << 3); - } + // Align with block boundaries (magic compiler makes it fast) + if ((ctx->input_idx & 127) != 0) { + size_t nb_bytes = MIN(gap(ctx->input_idx, 128), message_size); + COPY(ctx->input + ctx->input_idx, message, nb_bytes); ctx->input_idx += nb_bytes; message += nb_bytes; message_size -= nb_bytes; } - // Align with block boundaries (faster than byte by byte) - if ((ctx->input_idx & 127) != 0) { - size_t nb_words = MIN(gap(ctx->input_idx, 128), message_size) >> 3; - load64_le_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words); - ctx->input_idx += nb_words << 3; - message += nb_words << 3; - message_size -= nb_words << 3; - } - // Process block by block size_t nb_blocks = message_size >> 7; FOR (i, 0, nb_blocks) { if (ctx->input_idx == 128) { blake2b_compress(ctx, 0); } - load64_le_buf(ctx->input, message, 16); + COPY(ctx->input, message, 128); message += 128; ctx->input_idx = 128; } @@ -599,22 +583,18 @@ void crypto_blake2b_update(crypto_blake2b_ctx *ctx, ctx->input_idx = 0; } if (ctx->input_idx == 0) { - ZERO(ctx->input, 16); + ZERO(ctx->input, 128); } // Fill remaining words (faster than byte by byte) size_t nb_words = message_size >> 3; - load64_le_buf(ctx->input, message, nb_words); + COPY(ctx->input, message, nb_words << 3); ctx->input_idx += nb_words << 3; message += nb_words << 3; message_size -= nb_words << 3; // Fill remaining bytes - FOR (i, 0, message_size) { - size_t word = ctx->input_idx >> 3; - size_t byte = ctx->input_idx & 7; - ctx->input[word] |= (u64)message[i] << (byte << 3); - ctx->input_idx++; - } + COPY(ctx->input + ctx->input_idx, message, message_size); + ctx->input_idx += message_size; } } diff --git a/src/monocypher.h b/src/monocypher.h index cf635e8..c73b4d4 100644 --- a/src/monocypher.h +++ b/src/monocypher.h @@ -136,7 +136,7 @@ typedef struct { // for they may change without notice. 
uint64_t hash[8]; uint64_t input_offset[2]; - uint64_t input[16]; + uint8_t input[128]; size_t input_idx; size_t hash_size; } crypto_blake2b_ctx; diff --git a/tests/speed/speed.c b/tests/speed/speed.c index 1a7e9dd..1a44c95 100644 --- a/tests/speed/speed.c +++ b/tests/speed/speed.c @@ -122,6 +122,22 @@ static u64 blake2b_small(void) TIMING_END; } +static u64 blake2b_blocks(void) +{ + u8 hash[64]; + RANDOM_INPUT(input, 32); + + TIMING_START { + crypto_blake2b_ctx ctx; + crypto_blake2b_init(&ctx, 64); + FOR (i, 0, 1000) { + crypto_blake2b_update(&ctx, input, 32); + } + crypto_blake2b_final(&ctx, hash); + } + TIMING_END; +} + static u64 sha512(void) { u8 hash[64]; @@ -256,6 +272,7 @@ int main() print("Auth'd encryption ",authenticated()*MUL ,"megabytes per second"); print("BLAKE2b ",blake2b() *MUL ,"megabytes per second"); print("BLAKE2b (small) ",blake2b_small() ,"cycles per second"); + print("BLAKE2b (32B blocks)",blake2b_blocks() ,"cycles per second"); print("SHA-512 ",sha512() *MUL ,"megabytes per second"); print("SHA-512 (small) ",sha512_small() ,"cycles per second"); print("Argon2i, 3 passes ",argon2i() *MUL ,"megabytes per second");