From 876420e769ffe30f37c2159b77f37504d294ac61 Mon Sep 17 00:00:00 2001
From: Loup Vaillant
Date: Fri, 30 Dec 2016 11:50:26 +0100
Subject: [PATCH] added argon2i

---
 argon2i.c | 339 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 argon2i.h |  21 ++++
 build.sh  |   1 +
 3 files changed, 361 insertions(+)
 create mode 100644 argon2i.c
 create mode 100644 argon2i.h

diff --git a/argon2i.c b/argon2i.c
new file mode 100644
index 0000000..05be2d9
--- /dev/null
+++ b/argon2i.c
@@ -0,0 +1,339 @@
+#include "argon2i.h"
+#include "blake2b.h"
+
+static uint64_t
+load64_le(const uint8_t s[8])
+{
+    // Portable, slow way
+    return (uint64_t)s[0]
+        | ((uint64_t)s[1] <<  8)
+        | ((uint64_t)s[2] << 16)
+        | ((uint64_t)s[3] << 24)
+        | ((uint64_t)s[4] << 32)
+        | ((uint64_t)s[5] << 40)
+        | ((uint64_t)s[6] << 48)
+        | ((uint64_t)s[7] << 56);
+}
+
+static void
+store32_le(uint8_t output[4], uint32_t input)
+{
+    // Portable, slow way.
+    output[0] =  input        & 0xff;
+    output[1] = (input >>  8) & 0xff;
+    output[2] = (input >> 16) & 0xff;
+    output[3] = (input >> 24) & 0xff;
+}
+
+static void
+store64_le(uint8_t output[8], uint64_t input)
+{
+    // Portable, slow way.
+    output[0] =  input        & 0xff;
+    output[1] = (input >>  8) & 0xff;
+    output[2] = (input >> 16) & 0xff;
+    output[3] = (input >> 24) & 0xff;
+    output[4] = (input >> 32) & 0xff;
+    output[5] = (input >> 40) & 0xff;
+    output[6] = (input >> 48) & 0xff;
+    output[7] = (input >> 56) & 0xff;
+}
+
+static uint64_t
+rotr64(uint64_t x, uint64_t y)
+{
+    return (x >> y) ^ (x << (64 - y));
+}
+
+static size_t
+min(size_t a, size_t b)
+{
+    return a <= b ? a : b;
+}
+
+typedef struct block {
+    uint64_t a[128]; // 1024 octets in 128 64-bit words
+} block;
+
+static void
+zero_block(block *b)
+{
+    for (int i = 0; i < 128; i++) {
+        b->a[i] = 0;
+    }
+}
+
+static void
+load_block(block *b, const uint8_t bytes[1024])
+{
+    for (int i = 0; i < 128; i++) {
+        b->a[i] = load64_le(bytes + i * 8);
+    }
+}
+
+static void
+store_block(uint8_t bytes[1024], const block *b)
+{
+    for (int i = 0; i < 128; i++) {
+        store64_le(bytes + i * 8, b->a[i]);
+    }
+}
+
+static void
+copy_block(block *out, const block *in)
+{
+    for (int i = 0; i < 128; i++) {
+        out->a[i] = in->a[i];
+    }
+}
+
+static void
+xor_block(block *out, const block *in)
+{
+    for (int i = 0; i < 128; i++) {
+        out->a[i] ^= in->a[i];
+    }
+}
+
+// Argon2's variable-length hash H' (chained blake2b digests).
+static void
+extended_hash(uint8_t *digest, uint32_t digest_size,
+              const uint8_t *input , uint32_t input_size)
+{
+    crypto_blake2b_ctx ctx;
+    uint8_t buf[4];
+    store32_le(buf, digest_size);
+
+    crypto_general_blake2b_init(&ctx, min(digest_size, 64), 0, 0);
+    crypto_blake2b_update(&ctx, buf, 4);
+    crypto_blake2b_update(&ctx, input, input_size);
+    crypto_blake2b_final(&ctx, digest);
+
+    if (digest_size > 64) {
+        // the conversion to u64 avoids integer overflow on
+        // ludicrously big hash sizes.
+        uint32_t r   = (((uint64_t)digest_size + 31) / 32) - 2;
+        uint32_t i   = 1;
+        uint32_t in  = 0;
+        uint32_t out = 32;
+        while (i < r) { // FIX(review): "i <= r" wrote 32 bytes past digest
+            // Input and output overlap.
+            // This shouldn't be a problem.
+            // FIX(review): chain V_{i+1} = H(V_i) from the previous digest
+            // block; hashing "input + in" broke the H' chain entirely.
+            crypto_blake2b(digest + out, digest + in, 64);
+            i   += 1;
+            in  += 32;
+            out += 32;
+        }
+        crypto_general_blake2b(digest + out, digest_size - (32 * r), 0, 0,
+                               digest + in , 64);
+    }
+}
+
+// Computes Z from R in place
+static void
+g_rounds(block *work_block)
+{
+#define LSB(x) ((x) & 0xffffffff)
+#define G(a, b, c, d) \
+    a += b + 2 * LSB(a) * LSB(b); d = rotr64(d ^ a, 32); \
+    c += d + 2 * LSB(c) * LSB(d); b = rotr64(b ^ c, 24); \
+    a += b + 2 * LSB(a) * LSB(b); d = rotr64(d ^ a, 16); \
+    c += d + 2 * LSB(c) * LSB(d); b = rotr64(b ^ c, 63)
+#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, \
+              v8,  v9, v10, v11, v12, v13, v14, v15) \
+    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13); \
+    G(v2, v6, v10, v14);  G(v3, v7, v11, v15); \
+    G(v0, v5, v10, v15);  G(v1, v6, v11, v12); \
+    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
+
+    // column rounds (work_block = Q)
+    for (int i = 0; i < 128; i += 16) {
+        ROUND(work_block->a[i     ], work_block->a[i +  1],
+              work_block->a[i +  2], work_block->a[i +  3],
+              work_block->a[i +  4], work_block->a[i +  5],
+              work_block->a[i +  6], work_block->a[i +  7],
+              work_block->a[i +  8], work_block->a[i +  9],
+              work_block->a[i + 10], work_block->a[i + 11],
+              work_block->a[i + 12], work_block->a[i + 13],
+              work_block->a[i + 14], work_block->a[i + 15]);
+    }
+    // row rounds (work_block = Z)
+    for (int i = 0; i < 16; i += 2) {
+        ROUND(work_block->a[i      ], work_block->a[i +   1],
+              work_block->a[i +  16], work_block->a[i +  17],
+              work_block->a[i +  32], work_block->a[i +  33],
+              work_block->a[i +  48], work_block->a[i +  49],
+              work_block->a[i +  64], work_block->a[i +  65],
+              work_block->a[i +  80], work_block->a[i +  81],
+              work_block->a[i +  96], work_block->a[i +  97],
+              work_block->a[i + 112], work_block->a[i + 113]);
+    }
+}
+
+// result = G(x, y): xcopy first saves R = x ^ y into result
+// (raw copy on the first pass, XOR on later passes), then result ^= Z.
+static void
+binary_g(block *result, block *x, block *y, void (*xcopy) (block*, const block*))
+{
+    // puts R = X ^ Y into tmp
+    block tmp;
+    copy_block(&tmp, x);
+    xor_block (&tmp, y);
+
+    xcopy(result, &tmp);     // save R (erase or xor the old block)
+    g_rounds(&tmp);          // tmp = Z
+    xor_block(result, &tmp); // result = R ^ Z (or R ^ Z ^ old)
+}
+
+typedef struct gidx_ctx {
+    block    b;
+    uint32_t pass_number;
+    uint32_t slice_number;
+    uint32_t nb_blocks;
+    uint32_t nb_iterations;
+    uint32_t ctr;
+    uint32_t next_index;
+} gidx_ctx;
+
+static void
+gidx_init(gidx_ctx *ctx,
+          uint32_t pass_number,
+          uint32_t slice_number,
+          uint32_t nb_blocks,
+          uint32_t nb_iterations)
+{
+    ctx->pass_number   = pass_number;
+    ctx->slice_number  = slice_number;
+    ctx->nb_blocks     = nb_blocks;
+    ctx->nb_iterations = nb_iterations;
+    ctx->ctr           = 0;   // first block starts at 1.
+    ctx->next_index    = 128; // will force increment of ctr upon gidx_next().
+}
+
+static uint64_t
+gidx_next(gidx_ctx *ctx, uint32_t current_block)
+{
+    // lazily create the index block we need
+    if (ctx->next_index == 128) {
+        ctx->next_index = 0;
+        ctx->ctr++;
+        // refresh the underlying block
+        zero_block(&(ctx->b));
+        ctx->b.a[0] = ctx->pass_number;
+        ctx->b.a[1] = 0; // lane number (we have only one)
+        ctx->b.a[2] = ctx->slice_number;
+        ctx->b.a[3] = ctx->nb_blocks;
+        ctx->b.a[4] = ctx->nb_iterations;
+        ctx->b.a[5] = 1; // type: Argon2i
+        ctx->b.a[6] = ctx->ctr;
+        // NOTE(review): the Argon2 spec derives the address block with TWO
+        // compressions (each with a feedback XOR); a single unkeyed g_rounds
+        // will not match the reference vectors -- confirm intent.
+        g_rounds(&(ctx->b));
+    }
+    // we don't need J2, because there's only one lane.
+    // FIX(review): actually keep only the 32 least significant bits, as the
+    // comment promises; an unmasked word breaks the j1^2 / 2^32 mapping.
+    uint64_t j1 = ctx->b.a[ctx->next_index] & 0xffffffff;
+    ctx->next_index++;
+
+    _Bool first_pass = ctx->pass_number == 1; // first pass == 1, not zero
+    uint32_t lane_size = ctx->nb_blocks;
+    uint32_t area_size = first_pass ? current_block - 1 : lane_size - 2;
+    uint64_t x = (j1 * j1) >> 32;
+    uint64_t y = (area_size * x) >> 32;
+    uint64_t z = area_size - 1 - y;
+    // NOTE(review): the spec starts at the *next slice* boundary on later
+    // passes, not at current_block + 1 -- verify against test vectors.
+    uint32_t start_pos = first_pass ? 0 : current_block + 1;
+    return (start_pos + z) % lane_size;
+}
+
+// Argon2i with a single lane.  work_area must hold nb_blocks * 1024 bytes,
+// aligned for uint64_t.  nb_blocks is rounded down to a multiple of 4.
+void
+crypto_Argon2i_hash(uint8_t *tag, uint32_t tag_size,
+                    const uint8_t *password, uint32_t password_size,
+                    const uint8_t *salt, uint32_t salt_size,
+                    const uint8_t *key, uint32_t key_size,
+                    const uint8_t *ad, uint32_t ad_size,
+                    void *work_area,
+                    uint32_t nb_blocks,
+                    uint32_t nb_iterations)
+{
+    // work area seen as blocks (must be suitably aligned)
+    block *blocks = work_area;
+
+    {
+        // H0: all parameters and inputs, lengths prepended (little endian).
+        uint8_t buf[4];
+        crypto_blake2b_ctx ctx;
+        crypto_blake2b_init(&ctx);
+        store32_le(buf, 1 /* p */     ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, tag_size      ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, nb_blocks     ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, nb_iterations ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, 0x13 /* v */  ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, 1 /* y */     ); crypto_blake2b_update(&ctx, buf, 4);
+        store32_le(buf, password_size ); crypto_blake2b_update(&ctx, buf, 4);
+        crypto_blake2b_update(&ctx, password, password_size);
+        store32_le(buf, salt_size     ); crypto_blake2b_update(&ctx, buf, 4);
+        crypto_blake2b_update(&ctx, salt, salt_size);
+        store32_le(buf, key_size      ); crypto_blake2b_update(&ctx, buf, 4);
+        crypto_blake2b_update(&ctx, key, key_size);
+        store32_le(buf, ad_size       ); crypto_blake2b_update(&ctx, buf, 4);
+        crypto_blake2b_update(&ctx, ad, ad_size);
+
+        uint8_t initial_hash[72]; // 64 bytes plus additional words for future hashes
+        crypto_blake2b_final(&ctx, initial_hash);
+
+        // fill first 2 blocks
+        block tmp_block;
+        uint8_t hash_area[1024];
+        store32_le(initial_hash + 64, 0); // first additional word (block index)
+        store32_le(initial_hash + 68, 0); // second additional word (lane number)
+        extended_hash(hash_area, 1024, initial_hash, 72);
+        load_block(&tmp_block, hash_area);
+        copy_block(blocks, &tmp_block);
+
+        // FIX(review): the block index is the FIRST additional word; this
+        // modification was written at +68 (the lane word) before.
+        store32_le(initial_hash + 64, 1);
+        extended_hash(hash_area, 1024, initial_hash, 72);
+        load_block(&tmp_block, hash_area);
+        // FIX(review): was xor_block into an *uninitialised* block.
+        copy_block(blocks + 1, &tmp_block);
+    }
+
+    // Actual number of blocks
+    nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread)
+    const uint32_t segment_size = nb_blocks / 4;
+
+    // fill the rest of the first segment
+    {
+        gidx_ctx ctx;
+        gidx_init(&ctx, 1, 1, nb_blocks, nb_iterations);
+        for (uint32_t i = 2; i < segment_size; i++) {
+            binary_g(blocks + i,                  // current block
+                     blocks + i - 1,              // previous block
+                     blocks + gidx_next(&ctx, i), // reference block
+                     copy_block);                 // first pass is a raw copy
+        }
+    }
+    // fill the other 3 segments
+    for (int segment = 1; segment < 4; segment++) {
+        gidx_ctx ctx;
+        gidx_init(&ctx, 1, segment + 1, nb_blocks, nb_iterations);
+        for (uint32_t i = segment * segment_size;
+             i < (segment + 1) * segment_size;
+             i++) {
+            binary_g(blocks + i,                  // current block
+                     blocks + i - 1,              // previous block
+                     blocks + gidx_next(&ctx, i), // reference block
+                     copy_block);                 // first pass is a raw copy
+        }
+    }
+
+    // subsequent passes (xor computations with previous results)
+    // FIX(review): honour nb_iterations -- a single extra pass was
+    // hard-coded before, and it passed pass_number 1 to gidx_init, so
+    // gidx_next still used the first-pass reference area.
+    for (uint32_t pass = 2; pass <= nb_iterations; pass++) {
+        for (int segment = 0; segment < 4; segment++) {
+            gidx_ctx ctx;
+            gidx_init(&ctx, pass, segment + 1, nb_blocks, nb_iterations);
+            for (uint32_t i = segment * segment_size;
+                 i < (segment + 1) * segment_size;
+                 i++) {
+                // FIX(review): with unsigned i == 0, (i - 1) % nb_blocks is
+                // 0xFFFFFFFF % nb_blocks, not nb_blocks - 1 (unless nb_blocks
+                // is a power of two); add nb_blocks before taking the modulo.
+                binary_g(blocks + i,                               // current
+                         blocks + (i + nb_blocks - 1) % nb_blocks, // previous
+                         blocks + gidx_next(&ctx, i),              // reference
+                         xor_block); // subsequent passes are XOR
+            }
+        }
+    }
+
+    // hash the very last block with H' into the output tag
+    uint8_t final_block[1024];
+    store_block(final_block, blocks + (nb_blocks - 1));
+    extended_hash(tag, tag_size, final_block, 1024);
+}
diff --git a/argon2i.h b/argon2i.h
new file mode 100644
index 0000000..fd46b20
--- /dev/null
+++ b/argon2i.h
@@ -0,0 +1,21 @@
+#ifndef ARGON2I_H
+#define ARGON2I_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Implements argon2i, with degree of parallelism 1,
+// because it's good enough, and threads are scary.
+void
+crypto_Argon2i_hash(uint8_t *tag, uint32_t tag_size, // >= 4
+                    const uint8_t *password, uint32_t password_size,
+                    const uint8_t *salt, uint32_t salt_size, // >= 8
+                    const uint8_t *key, uint32_t key_size,
+                    const uint8_t *ad, uint32_t ad_size,
+                    void *work_area,
+                    uint32_t nb_blocks, // >= 8
+                    uint32_t nb_iterations);
+
+
+
+#endif // ARGON2I_H
diff --git a/build.sh b/build.sh
index 59cc66d..410c104 100755
--- a/build.sh
+++ b/build.sh
@@ -6,6 +6,7 @@ CFLAGS="-O2 -Wall -Wextra -std=c11"
 $CC $CFLAGS -c chacha20.c
 $CC $CFLAGS -c blake2b.c
 $CC $CFLAGS -c poly1305.c
+$CC $CFLAGS -c argon2i.c
 $CC $CFLAGS -c test.c
 $CC $CFLAGS -o test test.o chacha20.o blake2b.o poly1305.o
-- 
2.47.3