// references to R, Z, Q etc. come from the spec
// Argon2 operates on 1024 byte blocks.
-typedef struct { u64 a[128]; } block;
-
-static void wipe_block(block *b)
-{
- volatile u64* a = b->a;
- ZERO(a, 128);
-}
+typedef struct { u64 a[128]; } blk;
// updates a BLAKE2 hash with a 32 bit word, little endian.
static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
WIPE_BUFFER(buf);
}
-static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
-static void xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
+static void copy_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] = in->a[i];}
+static void xor_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] ^= in->a[i];}
// Hash with a virtually unlimited digest size.
// Doesn't extract more entropy than the base hash function.
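+// (Per the spec, H' works roughly as follows: digests of 64 bytes or
+// less are a plain BLAKE2b hash; longer digests chain 64-byte BLAKE2b
+// digests, taking 32 bytes from each link and the whole last digest.
+// Every byte thus flows through 64-byte states, hence no extra entropy.)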
G(v2, v7, v8, v13); G(v3, v4, v9, v14)
// Core of the compression function G. Computes Z from R in place.
-static void g_rounds(block *work_block)
+static void g_rounds(blk *b)
{
- // column rounds (work_block = Q)
+ // column rounds (b = Q)
for (int i = 0; i < 128; i += 16) {
- ROUND(work_block->a[i ], work_block->a[i + 1],
- work_block->a[i + 2], work_block->a[i + 3],
- work_block->a[i + 4], work_block->a[i + 5],
- work_block->a[i + 6], work_block->a[i + 7],
- work_block->a[i + 8], work_block->a[i + 9],
- work_block->a[i + 10], work_block->a[i + 11],
- work_block->a[i + 12], work_block->a[i + 13],
- work_block->a[i + 14], work_block->a[i + 15]);
+ ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 2], b->a[i+ 3],
+ b->a[i+ 4], b->a[i+ 5], b->a[i+ 6], b->a[i+ 7],
+ b->a[i+ 8], b->a[i+ 9], b->a[i+10], b->a[i+11],
+ b->a[i+12], b->a[i+13], b->a[i+14], b->a[i+15]);
}
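+ // (The loop above mixes 16 consecutive words at a time; the loop
+ // below mixes words at stride 16, so after both passes every word
+ // of the block has affected every other.)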
- // row rounds (work_block = Z)
+ // row rounds (b = Z)
for (int i = 0; i < 16; i += 2) {
- ROUND(work_block->a[i ], work_block->a[i + 1],
- work_block->a[i + 16], work_block->a[i + 17],
- work_block->a[i + 32], work_block->a[i + 33],
- work_block->a[i + 48], work_block->a[i + 49],
- work_block->a[i + 64], work_block->a[i + 65],
- work_block->a[i + 80], work_block->a[i + 81],
- work_block->a[i + 96], work_block->a[i + 97],
- work_block->a[i + 112], work_block->a[i + 113]);
- }
-}
-
-typedef struct {
- u32 nb_blocks;
- u32 nb_lanes;
- u32 nb_passes;
-} argon_hardness;
-
-typedef struct {
- u32 pass;
- u32 slice;
- u32 lane;
- u32 block;
-} argon_index;
-
-static u32 ref_index(u64 seed, argon_hardness h, argon_index idx)
-{
- u64 j1 = seed & 0xffffffff; // block selector (inside a lane)
- u64 j2 = seed >> 32; // lane selector
-
- // Blocks may be picked from any of:
- // - The last 3 slices (if they exist yet)
- // - The already constructed blocks in this segment (except the last)
- int first_pass = idx.pass == 0;
- u32 lane_size = h.nb_blocks / h.nb_lanes;
- u32 segment_size = lane_size / 4;
- u32 lane = j2 % h.nb_lanes;
-
- // Start of the reference set
- u32 next_slice = ((idx.slice + 1) % 4) * segment_size;
- u32 start = first_pass ? 0 : next_slice;
-
- // Size of the reference set
- u32 nb_segments = first_pass ? idx.slice : 3;
- u32 nb_blocks = lane == idx.lane ? idx.block - 1 : 0;
- nb_blocks -= lane != idx.lane && idx.block == 0; // unclear why; the reference implementation does the same
- u32 w_size = nb_segments * segment_size + nb_blocks;
-
- // Generate offset from J1 and J2
- u64 x = (j1 * j1) >> 32;
- u64 y = (w_size * x) >> 32;
- u64 z = (w_size - 1) - y;
- u64 ref = (start + z) % lane_size;
- return lane * lane_size + (u32)ref;
-}
-
-// Argon2i uses a kind of stream cipher to determine which reference
-// block it will take to synthesise the next block. This context holds
-// that stream's state. (It's very similar to Chacha20. The block b
-// is analogous to Chacha's own pool)
-typedef struct {
- block b;
- argon_hardness h;
- argon_index idx;
- u32 ctr;
-} gidx_ctx;
-
-// The block in the context will determine array indices. To avoid
-// timing attacks, it only depends on public information. No looking
-// at a previous block to seed the next. This makes offline attacks
-// easier, but timing attacks are the bigger threat in many settings.
-static void gidx_refresh(gidx_ctx *ctx)
-{
- ctx->ctr++;
-
- // seed the beginning of the block...
- ctx->b.a[0] = ctx->idx.pass;
- ctx->b.a[1] = ctx->idx.lane;
- ctx->b.a[2] = ctx->idx.slice;
- ctx->b.a[3] = ctx->h.nb_blocks;
- ctx->b.a[4] = ctx->h.nb_passes;
- ctx->b.a[5] = 1; // type: Argon2i
- ctx->b.a[6] = ctx->ctr;
- ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
-
- // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
- // (G "square" function), to get cheap pseudo-random numbers.
- block tmp;
- copy_block(&tmp, &ctx->b);
- g_rounds (&ctx->b);
- xor_block (&ctx->b, &tmp);
- copy_block(&tmp, &ctx->b);
- g_rounds (&ctx->b);
- xor_block (&ctx->b, &tmp);
- wipe_block(&tmp);
-}
-
-static void gidx_init(gidx_ctx *ctx, argon_hardness h, argon_index idx)
-{
- ctx->h = h;
- ctx->idx = idx;
- ctx->ctr = 0;
-
- // On the first slice of the first pass, 2 blocks are already
- // filled, and idx.block == 2 instead of zero. In this case the lazy
- // refresh does not happen, so we need to refresh manually.
- //
- // We could instead unconditionally refresh, and use an eager
- // refresh instead, but this wastes up to one refresh per segment.
- if (idx.block != 0) {
- gidx_refresh(ctx);
+ ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 16], b->a[i+ 17],
+ b->a[i+32], b->a[i+33], b->a[i+ 48], b->a[i+ 49],
+ b->a[i+64], b->a[i+65], b->a[i+ 80], b->a[i+ 81],
+ b->a[i+96], b->a[i+97], b->a[i+112], b->a[i+113]);
}
}
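+
+// For reference, a sketch of how g_rounds combines into the spec's full
+// compression function G(X, Y) = P(X ^ Y) ^ X ^ Y, with g_rounds playing
+// the role of the permutation P. The main loop below inlines this
+// (also XORing the old block in on later passes); g_compress is a
+// hypothetical helper, not part of this patch:
+//
+//    static void g_compress(blk *out, const blk *x, const blk *y)
+//    {
+//        blk r;
+//        copy_block(&r, x);   // R = X
+//        xor_block (&r, y);   // R = X ^ Y
+//        copy_block(out, &r); // out = R
+//        g_rounds  (&r);      // Z = P(R), in place
+//        xor_block (out, &r); // out = Z ^ R = G(X, Y)
+//    }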
-static u32 gidx_next(gidx_ctx *ctx)
-{
- // lazily creates the offset block we need
- if ((ctx->idx.block & 127) == 0) {
- gidx_refresh(ctx);
- }
- u32 index = ref_index(ctx->b.a[ctx->idx.block], ctx->h, ctx->idx);
- ctx->idx.block++;
- return index;
-
-}
-
const crypto_argon2_settings crypto_argon2i_defaults = {
CRYPTO_ARGON2_I, // algorithm
100000, 3, 1, // nb_blocks, nb_passes, nb_lanes
const u32 nb_blocks = lane_size * s.nb_lanes; // s.nb_blocks rounded down
// work area seen as blocks (must be suitably aligned)
- block *blocks = (block*)work_area;
+ blk *blocks = (blk*)work_area;
{
crypto_blake2b_ctx ctx;
crypto_blake2b_init(&ctx);
WIPE_BUFFER(hash_area);
}
- // fill (then re-fill) the rest of the blocks
- block tmp;
+ // Fill (and re-fill) the rest of the blocks
+ //
+ // Note: even though the segments within a slice can be computed
+ // in parallel (one thread per lane), we compute them sequentially,
+ // because Monocypher doesn't support threads.
+ //
+ // Yet optimal performance (and therefore security) requires one
+ // thread per lane. The only reason Monocypher supports multiple
+ // lanes is compatibility.
+ blk tmp;
FOR_T(u32, pass, 0, s.nb_passes) {
FOR_T(u32, slice, 0, 4) {
- // Each segment within the same slice are independent of
- // each other, and can be computed in parallel (one thread
- // per lane). We only need to wait for all segments to be
- // finished before starting the next slice
- //
- // Monocpher has no support for threads, so segments are
- // computed sequentially here. Note: optimal performance
- // (and therefore security) requires one thread per lane.
- // Without threads, multi-lane support is only there for
- // compatibility, or as a reference.
+ // On the first slice of the first pass,
+ // blocks 0 and 1 are already filled, hence pass_offset.
+ u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0;
+ u32 slice_offset = slice * segment_size;
+
+ // Each iteration of the following loop may be performed in
+ // a separate thread. All iterations must be done before we
+ // fill the next slice.
FOR_T(u32, segment, 0, s.nb_lanes) {
- // On the first slice of the first pass,
- // blocks 0 and 1 are already filled.
- // We use the offset to skip them.
- u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0;
- u32 lane_offset = segment * lane_size;
- u32 slice_offset = slice * segment_size;
- block *segment_start = blocks + lane_offset + slice_offset;
-
- gidx_ctx ctx; // public information, not wiped
- gidx_init(&ctx,
- (argon_hardness){ nb_blocks, s.nb_lanes, s.nb_passes},
- (argon_index) { pass, slice, segment, pass_offset});
- FOR_T (u32, current_block, pass_offset, segment_size) {
- block *reference = blocks + gidx_next(&ctx);
- block *current = segment_start + current_block;
- block *previous =
- current_block == 0 && slice_offset == 0
+ u32 index_ctr = 1;
+ blk index_block;
+ FOR_T (u32, block, pass_offset, segment_size) {
+ // Fill or refresh the deterministic index block
+ if (block == pass_offset || (block % 128) == 0) {
+ // seed the beginning of the block...
+ ZERO(index_block.a, 128);
+ index_block.a[0] = pass;
+ index_block.a[1] = segment;
+ index_block.a[2] = slice;
+ index_block.a[3] = nb_blocks;
+ index_block.a[4] = s.nb_passes;
+ index_block.a[5] = 1; // type: Argon2i
+ index_block.a[6] = index_ctr;
+ index_ctr++;
+
+ // Shuffle the block: block = G(G(block, zero), zero)
+ // (the G "square" function), to get pseudo-random numbers.
+ copy_block(&tmp, &index_block);
+ g_rounds (&index_block);
+ xor_block (&index_block, &tmp);
+ copy_block(&tmp, &index_block);
+ g_rounds (&index_block);
+ xor_block (&index_block, &tmp);
+ }
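+ // (Each refresh above yields 128 64-bit seeds, one per block. With
+ // the 100000-block, one-lane defaults, segment_size is 25000, so a
+ // segment uses about 196 index blocks.)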
+
+ // Establish the reference set. *Approximately* comprises:
+ // - The last 3 slices (if they exist yet)
+ // - The already constructed blocks in the current segment
+ u32 next_slice = ((slice + 1) % 4) * segment_size;
+ u32 window_start = pass == 0 ? 0 : next_slice;
+ u32 nb_segments = pass == 0 ? slice : 3;
+ u32 window_size = nb_segments * segment_size + block - 1;
+
+ // Generate offset from pseudo-random seed
+ u64 seed = index_block.a[block % 128]; // wraps with the refresh above
+ u64 j1 = seed & 0xffffffff; // block selector
+ u64 j2 = seed >> 32; // lane selector
+ u64 x = (j1 * j1) >> 32;
+ u64 y = (window_size * x) >> 32;
+ u64 z = (window_size - 1) - y;
+ u64 ref = (window_start + z) % lane_size;
+ u32 index = (j2 % s.nb_lanes) * lane_size + (u32)ref;
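+ // (Worked example with the defaults: 100000 blocks in one lane give
+ // lane_size = 100000, segment_size = 25000. On pass 0, slice 2,
+ // block 10: window_start = 0, window_size = 2*25000 + 10 - 1 = 50009.
+ // Squaring j1 biases z toward the end of the window, so recent
+ // blocks are referenced more often.)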
+
+ // Find current, previous, and reference blocks
+ u32 lane_offset = segment * lane_size;
+ blk *segment_start = blocks + lane_offset + slice_offset;
+ blk *reference = blocks + index;
+ blk *current = segment_start + block;
+ blk *previous =
+ block == 0 && slice_offset == 0
? segment_start + lane_size - 1
- : segment_start + current_block - 1;
+ : segment_start + block - 1;
// Apply compression function G,
// And copy it (or XOR it) to the current block.
}
}
}
- wipe_block(&tmp);
+ // Wipe temporary block
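+ // (volatile pointer, so the compiler can't optimise the wipe away)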
+ volatile u64* p = tmp.a;
+ ZERO(p, 128);
// XOR last blocks of each lane
- block *last_block = blocks + lane_size - 1;
+ blk *last_block = blocks + lane_size - 1;
FOR_T (u32, lane, 1, s.nb_lanes) {
- block *next_block = last_block + lane_size;
+ blk *next_block = last_block + lane_size;
xor_block(next_block, last_block);
last_block = next_block;
}
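+ // (The XORs accumulate: last_block ends up pointing at the final
+ // lane's last block, which now holds the XOR of every lane's last
+ // block. With one lane, the loop does nothing.)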
store64_le_buf(final_block, last_block->a, 128);
// Wipe work area
- volatile u64 *p = (u64*)work_area;
+ p = (u64*)work_area;
ZERO(p, 128 * nb_blocks);
// Hash the very last block with H' into the output hash