From: Loup Vaillant Date: Wed, 28 Dec 2022 08:08:37 +0000 (+0100) Subject: Argon2i: support multiple lanes X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=4ee41699638459e00fd02215a01768edecdee5e7;p=Monocypher.git Argon2i: support multiple lanes --- diff --git a/src/monocypher.c b/src/monocypher.c index 7714709..42b47df 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -660,16 +660,6 @@ static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input) WIPE_BUFFER(buf); } -static void load_block(block *b, const u8 bytes[1024]) -{ - load64_le_buf(b->a, bytes, 128); -} - -static void store_block(u8 bytes[1024], const block *b) -{ - store64_le_buf(bytes, b->a, 128); -} - static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];} static void xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];} @@ -747,18 +737,59 @@ static void g_rounds(block *work_block) } } +typedef struct { + u32 nb_blocks; + u32 nb_lanes; + u32 nb_passes; +} argon_hardness; + +typedef struct { + u32 pass; + u32 slice; + u32 lane; + u32 block; +} argon_index; + +static u32 ref_index(u64 seed, argon_hardness h, argon_index idx) +{ + u64 j1 = seed & 0xffffffff; // block selector (inside a lane) + u64 j2 = seed >> 32; // lane selector + + // Blocks may be picked from any of: + // - The last 3 slices (if they exist yet) + // - The already constructed blocks in this segment (except the last) + int first_pass = idx.pass == 0; + u32 lane_size = h.nb_blocks / h.nb_lanes; + u32 segment_size = lane_size / 4; + u32 lane = j2 % h.nb_lanes; + + // Start of the reference set + u32 next_slice = ((idx.slice + 1) % 4) * segment_size; + u32 start = first_pass ? 0 : next_slice; + + // Size of the reference set + u32 nb_segments = first_pass ? idx.slice : 3; + u32 nb_blocks = lane == idx.lane ? idx.block - 1 : 0; + nb_blocks -= lane != idx.lane && idx.block == 0; // why the fuck? + u32 w_size = nb_segments * segment_size + nb_blocks; + + // Generate offset from J1 and J2 + u64 x = (j1 * j1) >> 32; + u64 y = (w_size * x) >> 32; + u64 z = (w_size - 1) - y; + u64 ref = (start + z) % lane_size; + return lane * lane_size + (u32)ref; +} + // Argon2i uses a kind of stream cipher to determine which reference // block it will take to synthesise the next block. This context hold // that stream's state. (It's very similar to Chacha20. The block b // is analogous to Chacha's own pool) typedef struct { block b; - u32 pass_number; - u32 slice_number; - u32 nb_blocks; - u32 nb_iterations; + argon_hardness h; + argon_index idx; u32 ctr; - u32 offset; } gidx_ctx; // The block in the context will determine array indices. To avoid @@ -767,12 +798,14 @@ typedef struct { // easier, but timing attacks are the bigger threat in many settings. static void gidx_refresh(gidx_ctx *ctx) { + ctx->ctr++; + // seed the beginning of the block... - ctx->b.a[0] = ctx->pass_number; - ctx->b.a[1] = 0; // lane number (we have only one) - ctx->b.a[2] = ctx->slice_number; - ctx->b.a[3] = ctx->nb_blocks; - ctx->b.a[4] = ctx->nb_iterations; + ctx->b.a[0] = ctx->idx.pass; + ctx->b.a[1] = ctx->idx.lane; + ctx->b.a[2] = ctx->idx.slice; + ctx->b.a[3] = ctx->h.nb_blocks; + ctx->b.a[4] = ctx->h.nb_passes; ctx->b.a[5] = 1; // type: Argon2i ctx->b.a[6] = ctx->ctr; ZERO(ctx->b.a + 7, 121); // ...then zero the rest out @@ -789,87 +822,60 @@ static void gidx_refresh(gidx_ctx *ctx) wipe_block(&tmp); } -static void gidx_init(gidx_ctx *ctx, - u32 pass_number, u32 slice_number, - u32 nb_blocks, u32 nb_iterations) -{ - ctx->pass_number = pass_number; - ctx->slice_number = slice_number; - ctx->nb_blocks = nb_blocks; - ctx->nb_iterations = nb_iterations; - ctx->ctr = 0; - - // Offset from the beginning of the segment. For the first slice - // of the first pass, we start at the *third* block, so the offset - // starts at 2, not 0. - if (pass_number != 0 || slice_number != 0) { - ctx->offset = 0; - } else { - ctx->offset = 2; - ctx->ctr++; // Compensates for missed lazy creation - gidx_refresh(ctx); // at the start of gidx_next() +static void gidx_init(gidx_ctx *ctx, argon_hardness h, argon_index idx) +{ + ctx->h = h; + ctx->idx = idx; + ctx->ctr = 0; + + // On the first slice of the first pass, 2 blocks are already + // filled, and idx.block == 2 instead of zero. In this case the lazy + // refresh does not happen, so we need to refresh manually. + // + // We could instead unconditionally refresh, and use an eager + // refresh instead, but this wastes up to one refresh per segment. + if (idx.block != 0) { + gidx_refresh(ctx); } } static u32 gidx_next(gidx_ctx *ctx) { // lazily creates the offset block we need - if ((ctx->offset & 127) == 0) { - ctx->ctr++; + if ((ctx->idx.block & 127) == 0) { gidx_refresh(ctx); } - u32 index = ctx->offset & 127; // save index for current call - u32 offset = ctx->offset; // save offset for current call - ctx->offset++; // update offset for next call - - // Computes the area size. - // Pass 0 : all already finished segments plus already constructed - // blocks in this segment - // Pass 1+: 3 last segments plus already constructed - // blocks in this segment. THE SPEC SUGGESTS OTHERWISE. - // I CONFORM TO THE REFERENCE IMPLEMENTATION. - int first_pass = ctx->pass_number == 0; - u32 slice_size = ctx->nb_blocks >> 2; - u32 nb_segments = first_pass ? ctx->slice_number : 3; - u32 area_size = nb_segments * slice_size + offset - 1; - - // Computes the starting position of the reference area. - // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE - // NEXT SEGMENT, NOT THE NEXT BLOCK. - u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size; - u32 start_pos = first_pass ? 0 : next_slice; - - // Generate offset from J1 (no need for J2, there's only one lane) - u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number - u64 x = (j1 * j1) >> 32; - u64 y = (area_size * x) >> 32; - u64 z = (area_size - 1) - y; - u64 ref = start_pos + z; // ref < 2 * nb_blocks - return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks); + u32 index = ref_index(ctx->b.a[ctx->idx.block], ctx->h, ctx->idx); + ctx->idx.block++; + return index; + } const crypto_argon2_settings crypto_argon2i_defaults = { CRYPTO_ARGON2_I, // algorithm - 100000, 3, 1, // nb_blocks, nb_iterations, nb_lanes + 100000, 3, 1, // nb_blocks, nb_passes, nb_lanes 16, 32, // salt_size, hash_size 0, 0, 0, 0, // no key, no ad }; -// Main algorithm void crypto_argon2(u8 *hash, void *work_area, const u8 *password, u32 password_size, const u8 *salt, crypto_argon2_settings s) { + const u32 segment_size = s.nb_blocks / s.nb_lanes / 4; + const u32 lane_size = segment_size * 4; + const u32 nb_blocks = lane_size * s.nb_lanes; // s.nb_blocks rounded down + // work area seen as blocks (must be suitably aligned) block *blocks = (block*)work_area; { crypto_blake2b_ctx ctx; crypto_blake2b_init(&ctx); - blake_update_32 (&ctx, s.nb_lanes ); // p: number of "threads" - blake_update_32 (&ctx, s.hash_size ); - blake_update_32 (&ctx, s.nb_blocks ); - blake_update_32 (&ctx, s.nb_iterations); - blake_update_32 (&ctx, 0x13 ); // v: version number - blake_update_32 (&ctx, s.algorithm ); // y: Argon2i, Argon2d... + blake_update_32 (&ctx, s.nb_lanes ); // p: number of "threads" + blake_update_32 (&ctx, s.hash_size); + blake_update_32 (&ctx, s.nb_blocks); + blake_update_32 (&ctx, s.nb_passes); + blake_update_32 (&ctx, 0x13 ); // v: version number + blake_update_32 (&ctx, s.algorithm); // y: Argon2i, Argon2d... blake_update_32 (&ctx, password_size); crypto_blake2b_update(&ctx, password, password_size); blake_update_32 (&ctx, s.salt_size); @@ -882,67 +888,87 @@ void crypto_argon2(u8 *hash, void *work_area, const u8 *password, u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes crypto_blake2b_final(&ctx, initial_hash); - // fill first 2 blocks + // fill first 2 blocks of each lane u8 hash_area[1024]; - store32_le(initial_hash + 64, 0); // first additional word - store32_le(initial_hash + 68, 0); // second additional word - extended_hash(hash_area, 1024, initial_hash, 72); - load_block(blocks, hash_area); - - store32_le(initial_hash + 64, 1); // slight modification - extended_hash(hash_area, 1024, initial_hash, 72); - load_block(blocks + 1, hash_area); + FOR_T(u32, l, 0, s.nb_lanes) { + FOR_T(u32, i, 0, 2) { + store32_le(initial_hash + 64, i); // first additional word + store32_le(initial_hash + 68, l); // second additional word + extended_hash(hash_area, 1024, initial_hash, 72); + load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128); + } + } WIPE_BUFFER(initial_hash); WIPE_BUFFER(hash_area); } - // Actual number of blocks (must be a multiple of 4 p) - u32 nb_blocks = s.nb_blocks - s.nb_blocks % (4 * s.nb_lanes); - const u32 segment_size = nb_blocks >> 2; - // fill (then re-fill) the rest of the blocks block tmp; - gidx_ctx ctx; // public information, no need to wipe - FOR_T (u32, pass_number, 0, s.nb_iterations) { - int first_pass = pass_number == 0; - - FOR_T (u32, segment, 0, 4) { - gidx_init(&ctx, pass_number, segment, nb_blocks, s.nb_iterations); - - // On the first segment of the first pass, - // blocks 0 and 1 are already filled. - // We use the offset to skip them. - u32 start_offset = first_pass && segment == 0 ? 2 : 0; - u32 segment_start = segment * segment_size + start_offset; - u32 segment_end = (segment + 1) * segment_size; - FOR_T (u32, current_block, segment_start, segment_end) { - block *reference = blocks + gidx_next(&ctx); - block *current = blocks + current_block; - block *previous = - current_block == 0 - ? blocks + nb_blocks - 1 - : blocks + current_block - 1; - // Apply compression function G, - // And copy it (or XOR it) to the current block. - copy_block(&tmp, previous); - xor_block (&tmp, reference); - if (first_pass) { copy_block(current, &tmp); } - else { xor_block (current, &tmp); } - g_rounds (&tmp); - xor_block (current, &tmp); + FOR_T(u32, pass, 0, s.nb_passes) { + FOR_T(u32, slice, 0, 4) { + // Each segment within the same slice are independent of + // each other, and can be computed in parallel (one thread + // per lane). We only need to wait for all segments to be + // finished before starting the next slice + // + // Monocpher has no support for threads, so segments are + // computed sequentially here. Note: optimal performance + // (and therefore security) requires one thread per lane. + // Without threads, multi-lane support is only there for + // compatibility, or as a reference. + FOR_T(u32, segment, 0, s.nb_lanes) { + // On the first slice of the first pass, + // blocks 0 and 1 are already filled. + // We use the offset to skip them. + u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0; + u32 lane_offset = segment * lane_size; + u32 slice_offset = slice * segment_size; + block *segment_start = blocks + lane_offset + slice_offset; + + gidx_ctx ctx; // public information, not wiped + gidx_init(&ctx, + (argon_hardness){ nb_blocks, s.nb_lanes, s.nb_passes}, + (argon_index) { pass, slice, segment, pass_offset}); + FOR_T (u32, current_block, pass_offset, segment_size) { + block *reference = blocks + gidx_next(&ctx); + block *current = segment_start + current_block; + block *previous = + current_block == 0 && slice_offset == 0 + ? segment_start + lane_size - 1 + : segment_start + current_block - 1; + + // Apply compression function G, + // And copy it (or XOR it) to the current block. + copy_block(&tmp, previous); + xor_block (&tmp, reference); + if (pass == 0) { copy_block(current, &tmp); } + else { xor_block (current, &tmp); } + g_rounds (&tmp); + xor_block (current, &tmp); + } } } } wipe_block(&tmp); + + // XOR last blocks of each lane + block *last_block = blocks + lane_size - 1; + FOR_T (u32, lane, 1, s.nb_lanes) { + block *next_block = last_block + lane_size; + xor_block(next_block, last_block); + last_block = next_block; + } + + // Serialize last block u8 final_block[1024]; - store_block(final_block, blocks + (nb_blocks - 1)); + store64_le_buf(final_block, last_block->a, 128); - // wipe work area + // Wipe work area volatile u64 *p = (u64*)work_area; ZERO(p, 128 * nb_blocks); - // hash the very last block with H' into the output hash + // Hash the very last block with H' into the output hash extended_hash(hash, s.hash_size, final_block, 1024); WIPE_BUFFER(final_block); } diff --git a/src/monocypher.h b/src/monocypher.h index 6dacc87..08a3029 100644 --- a/src/monocypher.h +++ b/src/monocypher.h @@ -144,13 +144,13 @@ void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size, // ---------------------------------- typedef struct { - uint32_t algorithm; // Argon2i, Argon2d, Argon2id - uint32_t nb_blocks; // memory hardness, >= 8 - uint32_t nb_iterations; // CPU hardness, >= 1 (>= 3 recommended for Argon2i) - uint32_t nb_lanes; // parallelism level (single threaded anyway) - uint32_t salt_size; // we recommend 16 bytes - uint32_t hash_size; // we recommend 32 bytes per key - const uint8_t *key; // pointers are aligned to 8 bytes + uint32_t algorithm; // Argon2i, Argon2d, Argon2id + uint32_t nb_blocks; // memory hardness, >= 8 + uint32_t nb_passes; // CPU hardness, >= 1 (>= 3 recommended for Argon2i) + uint32_t nb_lanes; // parallelism level (single threaded anyway) + uint32_t salt_size; // we recommend 16 bytes + uint32_t hash_size; // we recommend 32 bytes per key + const uint8_t *key; // pointers are aligned to 8 bytes const uint8_t *ad; uint32_t key_size; uint32_t ad_size; diff --git a/tests/gen/argon2i.c b/tests/gen/argon2i.c index d22e7a5..3bbee5f 100644 --- a/tests/gen/argon2i.c +++ b/tests/gen/argon2i.c @@ -73,7 +73,9 @@ static void test(size_t nb_blocks, size_t hash_size, size_t nb_iterations) print_number(nb_iterations ); print_vector(password, 16 ); print_vector(salt , crypto_pwhash_SALTBYTES); - printf(":\n:\n"); // no key, no additionnal data + print_number(1); // one lane (no parallelism) + printf(":\n"); // no key + printf(":\n"); // no additionnal data print_vector(hash , hash_size ); printf("\n"); } diff --git a/tests/gen/vectors/argon2i b/tests/gen/vectors/argon2i index 7b4c51e..c96ea2b 100644 --- a/tests/gen/vectors/argon2i +++ b/tests/gen/vectors/argon2i @@ -2,6 +2,16 @@ 0300000000000000: 0101010101010101010101010101010101010101010101010101010101010101: 02020202020202020202020202020202: +0100000000000000: 0303030303030303: 040404040404040404040404: afe519be3ab0e92375df221dfb17347080c7000b1be85f9ee39978bf11e7cc3a: + +2000000000000000: +0300000000000000: +0101010101010101010101010101010101010101010101010101010101010101: +02020202020202020202020202020202: +0400000000000000: +0303030303030303: +040404040404040404040404: +c814d9d1dc7f37aa13f0d77f2494bda1c8de6b016dd388d29952a4c4672b6ce8: diff --git a/tests/test.c b/tests/test.c index bce5d50..f544775 100644 --- a/tests/test.c +++ b/tests/test.c @@ -522,24 +522,24 @@ static void test_hmac_sha512() /////////////// static void argon2i(vector_reader *reader) { - u64 nb_blocks = load64_le(next_input(reader).buf); - u64 nb_iterations = load64_le(next_input(reader).buf); - vector password = next_input(reader); - vector salt = next_input(reader); - vector key = next_input(reader); - vector ad = next_input(reader); - vector out = next_output(reader); - void *work_area = alloc(nb_blocks * 1024); - crypto_argon2_settings s = crypto_argon2i_defaults; - s.nb_blocks = nb_blocks; - s.nb_iterations = nb_iterations; - s.hash_size = out.size; - s.salt_size = salt.size; - s.key = key.buf; - s.key_size = key.size; - s.ad = ad.buf; - s.ad_size = ad.size; + + s.nb_blocks = load32_le(next_input(reader).buf); + s.nb_passes = load32_le(next_input(reader).buf); + vector password = next_input(reader); + vector salt = next_input(reader); + s.nb_lanes = load32_le(next_input(reader).buf); + vector key = next_input(reader); + vector ad = next_input(reader); + vector out = next_output(reader); + void *work_area = alloc(s.nb_blocks * 1024); + + s.hash_size = out.size; + s.salt_size = salt.size; + s.key = key.buf; + s.key_size = key.size; + s.ad = ad.buf; + s.ad_size = ad.size; crypto_argon2(out.buf, work_area, password.buf, password.size, salt.buf, s); free(work_area); @@ -568,12 +568,12 @@ static void test_argon2i() // without overlap crypto_argon2_settings s = crypto_argon2i_defaults; - s.nb_blocks = 8; - s.nb_iterations = 1; - s.key = key; - s.ad = ad; - s.key_size = 32; - s.ad_size = 32; + s.nb_blocks = 8; + s.nb_passes = 1; + s.key = key; + s.ad = ad; + s.key_size = 32; + s.ad_size = 32; crypto_argon2(hash1, clean_work_area, pass, 16, salt, s); // with overlap diff --git a/tests/tis-ci-vectors.h b/tests/tis-ci-vectors.h index 3ba12b0..d5ef069 100644 --- a/tests/tis-ci-vectors.h +++ b/tests/tis-ci-vectors.h @@ -349,18 +349,20 @@ static const char *argon2i_vectors[]={ "0300000000000000", "e4e4c4054fe35a75d9c0f679ad8770d8", "227e68e4c1e68ce67ee88e6be251a207", + "0100000000000000", "", "", "2a2ec585be2ec27c215f677e947c212b1b85de797167d4950e29987977c941117c4c5f6f6f547e62d76b88fa121781986a37ea14dc394917af5396ea58915d", - "0800000000000000", + "2000000000000000", "0300000000000000", - "48b3753cff3a6d990163e6b60da1e4e5", - "d6a2df78c16c96a52d4fb01ea4ecf70e", - "", - "", - "ec60819d04c1d35416d20abc5908dd972acbfd8f6a282ca2b642064242526683c0f1b237f38bac8279571f049bfed4d8d177ea336f2ec96456eb6c584d3c9607", + "0101010101010101010101010101010101010101010101010101010101010101", + "02020202020202020202020202020202", + "0400000000000000", + "0303030303030303", + "040404040404040404040404", + "c814d9d1dc7f37aa13f0d77f2494bda1c8de6b016dd388d29952a4c4672b6ce8", }; -static size_t nb_argon2i_vectors=14; +static size_t nb_argon2i_vectors=16; static const char *edDSA_vectors[]={ "50831c8cb43cd6822bf3f6fae0801cb6c843d8066b07346635365fb7d6ee54e5", "b600ab324d70d2372f3ba5a0d8bdd8b8e797f780b642bd56e69a18db74c389bc", diff --git a/tests/tis-ci.c b/tests/tis-ci.c index 053d6c2..1c21747 100644 --- a/tests/tis-ci.c +++ b/tests/tis-ci.c @@ -160,24 +160,24 @@ static void hmac_sha512(vector_reader *reader) static void argon2i(vector_reader *reader) { - u64 nb_blocks = load64_le(next_input(reader).buf); - u64 nb_iterations = load64_le(next_input(reader).buf); - vector password = next_input(reader); - vector salt = next_input(reader); - vector key = next_input(reader); - vector ad = next_input(reader); - vector out = next_output(reader); - void *work_area = alloc(nb_blocks * 1024); - crypto_argon2_settings s = crypto_argon2i_defaults; - s.nb_blocks = nb_blocks; - s.nb_iterations = nb_iterations; - s.hash_size = out.size; - s.salt_size = salt.size; - s.key = key.buf; - s.key_size = key.size; - s.ad = ad.buf; - s.ad_size = ad.size; + + s.nb_blocks = load32_le(next_input(reader).buf); + s.nb_passes = load32_le(next_input(reader).buf); + vector password = next_input(reader); + vector salt = next_input(reader); + s.nb_lanes = load32_le(next_input(reader).buf); + vector key = next_input(reader); + vector ad = next_input(reader); + vector out = next_output(reader); + void *work_area = alloc(s.nb_blocks * 1024); + + s.hash_size = out.size; + s.salt_size = salt.size; + s.key = key.buf; + s.key_size = key.size; + s.ad = ad.buf; + s.ad_size = ad.size; crypto_argon2(out.buf, work_area, password.buf, password.size, salt.buf, s); free(work_area);