From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Wed, 28 Dec 2022 08:08:37 +0000 (+0100)
Subject: Argon2i: support multiple lanes
X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=4ee41699638459e00fd02215a01768edecdee5e7;p=Monocypher.git

Argon2i: support multiple lanes
---

diff --git a/src/monocypher.c b/src/monocypher.c
index 7714709..42b47df 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -660,16 +660,6 @@ static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
 	WIPE_BUFFER(buf);
 }
 
-static void load_block(block *b, const u8 bytes[1024])
-{
-	load64_le_buf(b->a, bytes, 128);
-}
-
-static void store_block(u8 bytes[1024], const block *b)
-{
-	store64_le_buf(bytes, b->a, 128);
-}
-
 static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
 static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
 
@@ -747,18 +737,59 @@ static void g_rounds(block *work_block)
 	}
 }
 
+typedef struct {
+	u32 nb_blocks;
+	u32 nb_lanes;
+	u32 nb_passes;
+} argon_hardness;
+
+typedef struct {
+	u32 pass;
+	u32 slice;
+	u32 lane;
+	u32 block;
+} argon_index;
+
+static u32 ref_index(u64 seed, argon_hardness h, argon_index idx)
+{
+	u64 j1 = seed & 0xffffffff; // block selector (inside a lane)
+	u64 j2 = seed >> 32;        // lane selector
+
+	// Blocks may be picked from any of:
+	// - The last 3 slices (if they exist yet)
+	// - The already constructed blocks in this segment (except the last)
+	int first_pass   = idx.pass == 0;
+	u32 lane_size    = h.nb_blocks / h.nb_lanes;
+	u32 segment_size = lane_size / 4;
+	u32 lane         = j2 % h.nb_lanes;
+
+	// Start of the reference set
+	u32 next_slice   = ((idx.slice + 1) % 4) * segment_size;
+	u32 start        = first_pass ? 0 : next_slice;
+
+	// Size of the reference set
+	u32 nb_segments  = first_pass       ? idx.slice     : 3;
+	u32 nb_blocks    = lane == idx.lane ? idx.block - 1 : 0;
+	nb_blocks       -= lane != idx.lane && idx.block == 0; // why the fuck?
+	u32 w_size       = nb_segments * segment_size + nb_blocks;
+
+	// Generate offset from J1 and J2
+	u64 x   = (j1 * j1)    >> 32;
+	u64 y   = (w_size * x) >> 32;
+	u64 z   = (w_size - 1) - y;
+	u64 ref = (start + z) % lane_size;
+	return lane * lane_size + (u32)ref;
+}
+
 // Argon2i uses a kind of stream cipher to determine which reference
 // block it will take to synthesise the next block.  This context hold
 // that stream's state.  (It's very similar to Chacha20.  The block b
 // is analogous to Chacha's own pool)
 typedef struct {
 	block b;
-	u32 pass_number;
-	u32 slice_number;
-	u32 nb_blocks;
-	u32 nb_iterations;
+	argon_hardness h;
+	argon_index idx;
 	u32 ctr;
-	u32 offset;
 } gidx_ctx;
 
 // The block in the context will determine array indices. To avoid
@@ -767,12 +798,14 @@ typedef struct {
 // easier, but timing attacks are the bigger threat in many settings.
 static void gidx_refresh(gidx_ctx *ctx)
 {
+	ctx->ctr++;
+
 	// seed the beginning of the block...
-	ctx->b.a[0] = ctx->pass_number;
-	ctx->b.a[1] = 0;  // lane number (we have only one)
-	ctx->b.a[2] = ctx->slice_number;
-	ctx->b.a[3] = ctx->nb_blocks;
-	ctx->b.a[4] = ctx->nb_iterations;
+	ctx->b.a[0] = ctx->idx.pass;
+	ctx->b.a[1] = ctx->idx.lane;
+	ctx->b.a[2] = ctx->idx.slice;
+	ctx->b.a[3] = ctx->h.nb_blocks;
+	ctx->b.a[4] = ctx->h.nb_passes;
 	ctx->b.a[5] = 1;  // type: Argon2i
 	ctx->b.a[6] = ctx->ctr;
 	ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
@@ -789,87 +822,60 @@ static void gidx_refresh(gidx_ctx *ctx)
 	wipe_block(&tmp);
 }
 
-static void gidx_init(gidx_ctx *ctx,
-                      u32 pass_number, u32 slice_number,
-                      u32 nb_blocks,   u32 nb_iterations)
-{
-	ctx->pass_number   = pass_number;
-	ctx->slice_number  = slice_number;
-	ctx->nb_blocks     = nb_blocks;
-	ctx->nb_iterations = nb_iterations;
-	ctx->ctr           = 0;
-
-	// Offset from the beginning of the segment.  For the first slice
-	// of the first pass, we start at the *third* block, so the offset
-	// starts at 2, not 0.
-	if (pass_number != 0 || slice_number != 0) {
-		ctx->offset = 0;
-	} else {
-		ctx->offset = 2;
-		ctx->ctr++;         // Compensates for missed lazy creation
-		gidx_refresh(ctx);  // at the start of gidx_next()
+static void gidx_init(gidx_ctx *ctx, argon_hardness h, argon_index idx)
+{
+	ctx->h   = h;
+	ctx->idx = idx;
+	ctx->ctr = 0;
+
+	// On the first slice of the first pass, 2 blocks are already
+	// filled, and idx.block == 2 instead of zero. In this case the lazy
+	// refresh does not happen, so we need to refresh manually.
+	//
+	// We could instead unconditionally refresh, and use an eager
+	// refresh instead, but this wastes up to one refresh per segment.
+	if (idx.block != 0) {
+		gidx_refresh(ctx);
 	}
 }
 
 static u32 gidx_next(gidx_ctx *ctx)
 {
 	// lazily creates the offset block we need
-	if ((ctx->offset & 127) == 0) {
-		ctx->ctr++;
+	if ((ctx->idx.block & 127) == 0) {
 		gidx_refresh(ctx);
 	}
-	u32 index  = ctx->offset & 127; // save index  for current call
-	u32 offset = ctx->offset;       // save offset for current call
-	ctx->offset++;                  // update offset for next call
-
-	// Computes the area size.
-	// Pass 0 : all already finished segments plus already constructed
-	//          blocks in this segment
-	// Pass 1+: 3 last segments plus already constructed
-	//          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
-	//          I CONFORM TO THE REFERENCE IMPLEMENTATION.
-	int first_pass  = ctx->pass_number == 0;
-	u32 slice_size  = ctx->nb_blocks >> 2;
-	u32 nb_segments = first_pass ? ctx->slice_number : 3;
-	u32 area_size   = nb_segments * slice_size + offset - 1;
-
-	// Computes the starting position of the reference area.
-	// CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
-	// NEXT SEGMENT, NOT THE NEXT BLOCK.
-	u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
-	u32 start_pos  = first_pass ? 0 : next_slice;
-
-	// Generate offset from J1 (no need for J2, there's only one lane)
-	u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
-	u64 x   = (j1 * j1)       >> 32;
-	u64 y   = (area_size * x) >> 32;
-	u64 z   = (area_size - 1) - y;
-	u64 ref = start_pos + z;                // ref < 2 * nb_blocks
-	return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
+	u32 index = ref_index(ctx->b.a[ctx->idx.block], ctx->h, ctx->idx);
+	ctx->idx.block++;
+	return index;
+
 }
 
 const crypto_argon2_settings crypto_argon2i_defaults = {
 		CRYPTO_ARGON2_I, // algorithm
-		100000, 3, 1,    // nb_blocks, nb_iterations, nb_lanes
+		100000, 3, 1,    // nb_blocks, nb_passes, nb_lanes
 		16, 32,          // salt_size, hash_size
 		0, 0, 0, 0,      // no key, no ad
 };
 
-// Main algorithm
 void crypto_argon2(u8 *hash, void *work_area, const u8 *password,
                    u32 password_size, const u8 *salt, crypto_argon2_settings s)
 {
+	const u32 segment_size = s.nb_blocks / s.nb_lanes / 4;
+	const u32 lane_size    = segment_size * 4;
+	const u32 nb_blocks    = lane_size * s.nb_lanes; // s.nb_blocks rounded down
+
 	// work area seen as blocks (must be suitably aligned)
 	block *blocks = (block*)work_area;
 	{
 		crypto_blake2b_ctx ctx;
 		crypto_blake2b_init(&ctx);
-		blake_update_32      (&ctx, s.nb_lanes     ); // p: number of "threads"
-		blake_update_32      (&ctx, s.hash_size    );
-		blake_update_32      (&ctx, s.nb_blocks    );
-		blake_update_32      (&ctx, s.nb_iterations);
-		blake_update_32      (&ctx, 0x13           ); // v: version number
-		blake_update_32      (&ctx, s.algorithm    ); // y: Argon2i, Argon2d...
+		blake_update_32      (&ctx, s.nb_lanes ); // p: number of "threads"
+		blake_update_32      (&ctx, s.hash_size);
+		blake_update_32      (&ctx, s.nb_blocks);
+		blake_update_32      (&ctx, s.nb_passes);
+		blake_update_32      (&ctx, 0x13       ); // v: version number
+		blake_update_32      (&ctx, s.algorithm); // y: Argon2i, Argon2d...
 		blake_update_32      (&ctx,           password_size);
 		crypto_blake2b_update(&ctx, password, password_size);
 		blake_update_32      (&ctx,           s.salt_size);
@@ -882,67 +888,87 @@ void crypto_argon2(u8 *hash, void *work_area, const u8 *password,
 		u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
 		crypto_blake2b_final(&ctx, initial_hash);
 
-		// fill first 2 blocks
+		// fill first 2 blocks of each lane
 		u8 hash_area[1024];
-		store32_le(initial_hash + 64, 0); // first  additional word
-		store32_le(initial_hash + 68, 0); // second additional word
-		extended_hash(hash_area, 1024, initial_hash, 72);
-		load_block(blocks, hash_area);
-
-		store32_le(initial_hash + 64, 1); // slight modification
-		extended_hash(hash_area, 1024, initial_hash, 72);
-		load_block(blocks + 1, hash_area);
+		FOR_T(u32, l, 0, s.nb_lanes) {
+			FOR_T(u32, i, 0, 2) {
+				store32_le(initial_hash + 64, i); // first  additional word
+				store32_le(initial_hash + 68, l); // second additional word
+				extended_hash(hash_area, 1024, initial_hash, 72);
+				load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128);
+			}
+		}
 
 		WIPE_BUFFER(initial_hash);
 		WIPE_BUFFER(hash_area);
 	}
 
-	// Actual number of blocks (must be a multiple of 4 p)
-	u32 nb_blocks = s.nb_blocks - s.nb_blocks % (4 * s.nb_lanes);
-	const u32 segment_size = nb_blocks >> 2;
-
 	// fill (then re-fill) the rest of the blocks
 	block tmp;
-	gidx_ctx ctx; // public information, no need to wipe
-	FOR_T (u32, pass_number, 0, s.nb_iterations) {
-		int first_pass = pass_number == 0;
-
-		FOR_T (u32, segment, 0, 4) {
-			gidx_init(&ctx, pass_number, segment, nb_blocks, s.nb_iterations);
-
-			// On the first segment of the first pass,
-			// blocks 0 and 1 are already filled.
-			// We use the offset to skip them.
-			u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
-			u32 segment_start = segment * segment_size + start_offset;
-			u32 segment_end   = (segment + 1) * segment_size;
-			FOR_T (u32, current_block, segment_start, segment_end) {
-				block *reference = blocks + gidx_next(&ctx);
-				block *current   = blocks + current_block;
-				block *previous  =
-					current_block == 0
-					? blocks + nb_blocks - 1
-					: blocks + current_block - 1;
-				// Apply compression function G,
-				// And copy it (or XOR it) to the current block.
-				copy_block(&tmp, previous);
-				xor_block (&tmp, reference);
-				if (first_pass) { copy_block(current, &tmp); }
-				else            { xor_block (current, &tmp); }
-				g_rounds  (&tmp);
-				xor_block (current, &tmp);
+	FOR_T(u32, pass, 0, s.nb_passes) {
+		FOR_T(u32, slice, 0, 4) {
+			// Each segment within the same slice are independent of
+			// each other, and can be computed in parallel (one thread
+			// per lane).  We only need to wait for all segments to be
+			// finished before starting the next slice
+			//
+			// Monocpher has no support for threads, so segments are
+			// computed sequentially here.  Note: optimal performance
+			// (and therefore security) requires one thread per lane.
+			// Without threads, multi-lane support is only there for
+			// compatibility, or as a reference.
+			FOR_T(u32, segment, 0, s.nb_lanes) {
+				// On the first slice of the first pass,
+				// blocks 0 and 1 are already filled.
+				// We use the offset to skip them.
+				u32    pass_offset   = pass == 0 && slice == 0 ? 2 : 0;
+				u32    lane_offset   = segment * lane_size;
+				u32    slice_offset  = slice * segment_size;
+				block *segment_start = blocks + lane_offset + slice_offset;
+
+				gidx_ctx ctx; // public information, not wiped
+				gidx_init(&ctx,
+				          (argon_hardness){ nb_blocks, s.nb_lanes, s.nb_passes},
+				          (argon_index)   { pass, slice, segment, pass_offset});
+				FOR_T (u32, current_block, pass_offset, segment_size) {
+					block *reference = blocks + gidx_next(&ctx);
+					block *current   = segment_start + current_block;
+					block *previous  =
+						current_block == 0 && slice_offset == 0
+						? segment_start + lane_size - 1
+						: segment_start + current_block - 1;
+
+					// Apply compression function G,
+					// And copy it (or XOR it) to the current block.
+					copy_block(&tmp, previous);
+					xor_block (&tmp, reference);
+					if (pass == 0) { copy_block(current, &tmp); }
+					else           { xor_block (current, &tmp); }
+					g_rounds  (&tmp);
+					xor_block (current, &tmp);
+				}
 			}
 		}
 	}
 	wipe_block(&tmp);
+
+	// XOR last blocks of each lane
+	block *last_block = blocks + lane_size - 1;
+	FOR_T (u32, lane, 1, s.nb_lanes) {
+		block *next_block = last_block + lane_size;
+		xor_block(next_block, last_block);
+		last_block = next_block;
+	}
+
+	// Serialize last block
 	u8 final_block[1024];
-	store_block(final_block, blocks + (nb_blocks - 1));
+	store64_le_buf(final_block, last_block->a, 128);
 
-	// wipe work area
+	// Wipe work area
 	volatile u64 *p = (u64*)work_area;
 	ZERO(p, 128 * nb_blocks);
 
-	// hash the very last block with H' into the output hash
+	// Hash the very last block with H' into the output hash
 	extended_hash(hash, s.hash_size, final_block, 1024);
 	WIPE_BUFFER(final_block);
 }
diff --git a/src/monocypher.h b/src/monocypher.h
index 6dacc87..08a3029 100644
--- a/src/monocypher.h
+++ b/src/monocypher.h
@@ -144,13 +144,13 @@ void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
 // ----------------------------------
 
 typedef struct {
-	uint32_t algorithm;     // Argon2i, Argon2d, Argon2id
-	uint32_t nb_blocks;     // memory hardness, >= 8
-	uint32_t nb_iterations; // CPU hardness, >= 1 (>= 3 recommended for Argon2i)
-	uint32_t nb_lanes;      // parallelism level (single threaded anyway)
-	uint32_t salt_size;     // we recommend 16 bytes
-	uint32_t hash_size;     // we recommend 32 bytes per key
-	const uint8_t *key;     // pointers are aligned to 8 bytes
+	uint32_t algorithm;  // Argon2i, Argon2d, Argon2id
+	uint32_t nb_blocks;  // memory hardness, >= 8
+	uint32_t nb_passes;  // CPU hardness, >= 1 (>= 3 recommended for Argon2i)
+	uint32_t nb_lanes;   // parallelism level (single threaded anyway)
+	uint32_t salt_size;  // we recommend 16 bytes
+	uint32_t hash_size;  // we recommend 32 bytes per key
+	const uint8_t *key;  // pointers are aligned to 8 bytes
 	const uint8_t *ad;
 	uint32_t key_size;
 	uint32_t ad_size;
diff --git a/tests/gen/argon2i.c b/tests/gen/argon2i.c
index d22e7a5..3bbee5f 100644
--- a/tests/gen/argon2i.c
+++ b/tests/gen/argon2i.c
@@ -73,7 +73,9 @@ static void test(size_t nb_blocks, size_t hash_size, size_t nb_iterations)
 	print_number(nb_iterations                    );
 	print_vector(password, 16                     );
 	print_vector(salt    , crypto_pwhash_SALTBYTES);
-	printf(":\n:\n"); // no key, no additionnal data
+	print_number(1);  // one lane (no parallelism)
+	printf(":\n");    // no key
+	printf(":\n");    // no additionnal data
 	print_vector(hash    , hash_size              );
 	printf("\n");
 }
diff --git a/tests/gen/vectors/argon2i b/tests/gen/vectors/argon2i
index 7b4c51e..c96ea2b 100644
--- a/tests/gen/vectors/argon2i
+++ b/tests/gen/vectors/argon2i
@@ -2,6 +2,16 @@
 0300000000000000:
 0101010101010101010101010101010101010101010101010101010101010101:
 02020202020202020202020202020202:
+0100000000000000:
 0303030303030303:
 040404040404040404040404:
 afe519be3ab0e92375df221dfb17347080c7000b1be85f9ee39978bf11e7cc3a:
+
+2000000000000000:
+0300000000000000:
+0101010101010101010101010101010101010101010101010101010101010101:
+02020202020202020202020202020202:
+0400000000000000:
+0303030303030303:
+040404040404040404040404:
+c814d9d1dc7f37aa13f0d77f2494bda1c8de6b016dd388d29952a4c4672b6ce8:
diff --git a/tests/test.c b/tests/test.c
index bce5d50..f544775 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -522,24 +522,24 @@ static void test_hmac_sha512()
 ///////////////
 static void argon2i(vector_reader *reader)
 {
-	u64    nb_blocks     = load64_le(next_input(reader).buf);
-	u64    nb_iterations = load64_le(next_input(reader).buf);
-	vector password      = next_input(reader);
-	vector salt          = next_input(reader);
-	vector key           = next_input(reader);
-	vector ad            = next_input(reader);
-	vector out           = next_output(reader);
-	void  *work_area     = alloc(nb_blocks * 1024);
-
 	crypto_argon2_settings s = crypto_argon2i_defaults;
-	s.nb_blocks     = nb_blocks;
-	s.nb_iterations = nb_iterations;
-	s.hash_size     = out.size;
-	s.salt_size     = salt.size;
-	s.key           = key.buf;
-	s.key_size      = key.size;
-	s.ad            = ad.buf;
-	s.ad_size       = ad.size;
+
+	s.nb_blocks      = load32_le(next_input(reader).buf);
+	s.nb_passes      = load32_le(next_input(reader).buf);
+	vector password  = next_input(reader);
+	vector salt      = next_input(reader);
+	s.nb_lanes       = load32_le(next_input(reader).buf);
+	vector key       = next_input(reader);
+	vector ad        = next_input(reader);
+	vector out       = next_output(reader);
+	void  *work_area = alloc(s.nb_blocks * 1024);
+
+	s.hash_size = out.size;
+	s.salt_size = salt.size;
+	s.key       = key.buf;
+	s.key_size  = key.size;
+	s.ad        = ad.buf;
+	s.ad_size   = ad.size;
 
 	crypto_argon2(out.buf, work_area, password.buf, password.size, salt.buf, s);
 	free(work_area);
@@ -568,12 +568,12 @@ static void test_argon2i()
 
 		// without overlap
 		crypto_argon2_settings s = crypto_argon2i_defaults;
-		s.nb_blocks     = 8;
-		s.nb_iterations = 1;
-		s.key           = key;
-		s.ad            = ad;
-		s.key_size      = 32;
-		s.ad_size       = 32;
+		s.nb_blocks = 8;
+		s.nb_passes = 1;
+		s.key       = key;
+		s.ad        = ad;
+		s.key_size  = 32;
+		s.ad_size   = 32;
 		crypto_argon2(hash1, clean_work_area, pass, 16, salt, s);
 
 		// with overlap
diff --git a/tests/tis-ci-vectors.h b/tests/tis-ci-vectors.h
index 3ba12b0..d5ef069 100644
--- a/tests/tis-ci-vectors.h
+++ b/tests/tis-ci-vectors.h
@@ -349,18 +349,20 @@ static const char *argon2i_vectors[]={
   "0300000000000000",
   "e4e4c4054fe35a75d9c0f679ad8770d8",
   "227e68e4c1e68ce67ee88e6be251a207",
+  "0100000000000000",
   "",
   "",
   "2a2ec585be2ec27c215f677e947c212b1b85de797167d4950e29987977c941117c4c5f6f6f547e62d76b88fa121781986a37ea14dc394917af5396ea58915d",
-  "0800000000000000",
+  "2000000000000000",
   "0300000000000000",
-  "48b3753cff3a6d990163e6b60da1e4e5",
-  "d6a2df78c16c96a52d4fb01ea4ecf70e",
-  "",
-  "",
-  "ec60819d04c1d35416d20abc5908dd972acbfd8f6a282ca2b642064242526683c0f1b237f38bac8279571f049bfed4d8d177ea336f2ec96456eb6c584d3c9607",
+  "0101010101010101010101010101010101010101010101010101010101010101",
+  "02020202020202020202020202020202",
+  "0400000000000000",
+  "0303030303030303",
+  "040404040404040404040404",
+  "c814d9d1dc7f37aa13f0d77f2494bda1c8de6b016dd388d29952a4c4672b6ce8",
 };
-static size_t nb_argon2i_vectors=14;
+static size_t nb_argon2i_vectors=16;
 static const char *edDSA_vectors[]={
   "50831c8cb43cd6822bf3f6fae0801cb6c843d8066b07346635365fb7d6ee54e5",
   "b600ab324d70d2372f3ba5a0d8bdd8b8e797f780b642bd56e69a18db74c389bc",
diff --git a/tests/tis-ci.c b/tests/tis-ci.c
index 053d6c2..1c21747 100644
--- a/tests/tis-ci.c
+++ b/tests/tis-ci.c
@@ -160,24 +160,24 @@ static void hmac_sha512(vector_reader *reader)
 
 static void argon2i(vector_reader *reader)
 {
-	u64    nb_blocks     = load64_le(next_input(reader).buf);
-	u64    nb_iterations = load64_le(next_input(reader).buf);
-	vector password      = next_input(reader);
-	vector salt          = next_input(reader);
-	vector key           = next_input(reader);
-	vector ad            = next_input(reader);
-	vector out           = next_output(reader);
-	void  *work_area     = alloc(nb_blocks * 1024);
-
 	crypto_argon2_settings s = crypto_argon2i_defaults;
-	s.nb_blocks     = nb_blocks;
-	s.nb_iterations = nb_iterations;
-	s.hash_size     = out.size;
-	s.salt_size     = salt.size;
-	s.key           = key.buf;
-	s.key_size      = key.size;
-	s.ad            = ad.buf;
-	s.ad_size       = ad.size;
+
+	s.nb_blocks      = load32_le(next_input(reader).buf);
+	s.nb_passes      = load32_le(next_input(reader).buf);
+	vector password  = next_input(reader);
+	vector salt      = next_input(reader);
+	s.nb_lanes       = load32_le(next_input(reader).buf);
+	vector key       = next_input(reader);
+	vector ad        = next_input(reader);
+	vector out       = next_output(reader);
+	void  *work_area = alloc(s.nb_blocks * 1024);
+
+	s.hash_size = out.size;
+	s.salt_size = salt.size;
+	s.key       = key.buf;
+	s.key_size  = key.size;
+	s.ad        = ad.buf;
+	s.ad_size   = ad.size;
 
 	crypto_argon2(out.buf, work_area, password.buf, password.size, salt.buf, s);
 	free(work_area);