#include "argon2i.h"
#include "blake2b.h"
+// tests
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
static uint64_t
load64_le(const uint8_t s[8])
{
uint64_t a[128]; // 1024 octets in 128 64-bit words
} block;
-static void
-zero_block(block *b)
-{
- for (int i = 0; i < 128; i++) {
- b->a[i] = 0;
- }
-}
-
static void
load_block(block *b, const uint8_t bytes[1024])
{
}
}
+static void
+blake_update_32(crypto_blake2b_ctx *ctx, uint32_t input)
+{
+ uint8_t buf[4];
+ store32_le(buf, input);
+ crypto_blake2b_update(ctx, buf, 4);
+}
+
static void
extended_hash(uint8_t *digest, uint32_t digest_size,
const uint8_t *input , uint32_t input_size)
{
crypto_blake2b_ctx ctx;
- uint8_t buf[4];
- store32_le(buf, digest_size);
-
- crypto_general_blake2b_init(&ctx, min(digest_size, 64), 0, 0);
- crypto_blake2b_update(&ctx, buf, 4);
- crypto_blake2b_update(&ctx, input, input_size);
- crypto_blake2b_final(&ctx, digest);
+ crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0);
+ blake_update_32 (&ctx, digest_size);
+ crypto_blake2b_update (&ctx, input, input_size);
+ crypto_blake2b_final (&ctx, digest);
if (digest_size > 64) {
// the conversion to u64 avoids integer overflow on
uint32_t i = 1;
uint32_t in = 0;
uint32_t out = 32;
- while (i <= r) {
+ while (i < r) {
// Input and output overlap.
// This shouldn't be a problem.
- crypto_blake2b(digest + out, input + in, 64);
+ crypto_blake2b(digest + out, digest + in, 64);
i += 1;
in += 32;
out += 32;
}
- crypto_general_blake2b(digest + out, digest_size - (32 * r), 0, 0,
+ crypto_general_blake2b(digest + out, digest_size - (32 * r),
+ 0, 0, // no key
digest + in , 64);
}
}
static void
binary_g(block *result, block *x, block *y, void (*xcopy) (block*, const block*))
{
- // puts R = X ^ Y into tmp
+ // put R = X ^ Y into tmp
block tmp;
- copy_block(&tmp , x);
- xor_block (&tmp , y);
+ copy_block(&tmp, x);
+ xor_block (&tmp, y);
xcopy(result, &tmp); // save R (erase or xor the old block)
g_rounds(&tmp); // tmp = Z
xor_block(result, &tmp); // result = R ^ Z (or R ^ Z ^ old)
}
+// applies the "two rounds compression function" on the input, in place
+static void
+g_square(block *work_block)
+{
+ // work_block == R
+ block tmp;
+ for (int i = 0; i < 2; i++) {
+ copy_block(&tmp, work_block); // tmp = R
+ g_rounds(work_block); // work_block = Z
+ xor_block(work_block, &tmp); // work_block = Z ^ R
+ }
+}
+
typedef struct gidx_ctx {
block b;
uint32_t pass_number;
uint32_t nb_blocks;
uint32_t nb_iterations;
uint32_t ctr;
- uint32_t next_index;
+ uint32_t index;
} gidx_ctx;
+static void
+gidx_refresh(gidx_ctx *ctx)
+{
+ ctx->b.a[0] = ctx->pass_number;
+ ctx->b.a[1] = 0; // lane number (we have only one)
+ ctx->b.a[2] = ctx->slice_number;
+ ctx->b.a[3] = ctx->nb_blocks;
+ ctx->b.a[4] = ctx->nb_iterations;
+ ctx->b.a[5] = 1; // type: Argon2i
+ ctx->b.a[6] = ctx->ctr;
+ // zero the rest of the block
+ for (int i = 7; i < 128; i++) {
+ ctx->b.a[i] = 0;
+ }
+ // shuffle the block into something weakly pseudorandom
+ g_square(&(ctx->b));
+}
+
static void
gidx_init(gidx_ctx *ctx,
uint32_t pass_number,
ctx->slice_number = slice_number;
ctx->nb_blocks = nb_blocks;
ctx->nb_iterations = nb_iterations;
- ctx->ctr = 0; // first block starts by 1.
- ctx->next_index = 128; // will force increment of ctr upon gidx_next().
+ ctx->ctr = 1; // not zero, surprisingly
+ ctx->index = pass_number == 0 && slice_number == 0 ? 2 : 0;
+ gidx_refresh(ctx);
}
-static uint64_t
-gidx_next(gidx_ctx *ctx, uint32_t current_block)
+static uint32_t
+gidx_next(gidx_ctx *ctx)
{
- // lazily create the index block we need
- if (ctx->next_index == 128) {
- ctx->next_index = 0;
+ // lazily creates the index block we need
+ if (ctx->index == 128) {
+ ctx->index = 0;
ctx->ctr++;
- // refreshe the underlying block
- zero_block(&(ctx->b));
- ctx->b.a[0] = ctx->pass_number;
- ctx->b.a[1] = 0; // lane number (we have only one)
- ctx->b.a[2] = ctx->slice_number;
- ctx->b.a[3] = ctx->nb_blocks;
- ctx->b.a[4] = ctx->nb_iterations;
- ctx->b.a[5] = 1; // type: Argon2i
- ctx->b.a[6] = ctx->ctr;
- g_rounds(&(ctx->b));
+ gidx_refresh(ctx);
}
// we don't need J2, because there's only one lane.
- uint64_t j1 = ctx->b.a[ctx->next_index]; // 32 least significant bits
- ctx->next_index++;
-
- _Bool first_pass = ctx->pass_number == 1; // first pass == 1, not zero
- uint32_t lane_size = ctx->nb_blocks;
- uint32_t area_size = first_pass ? current_block - 1 : lane_size - 2;
- uint64_t x = (j1 * j1) >> 32;
- uint64_t y = (area_size * x) >> 32;
- uint64_t z = area_size - 1 - y;
- uint32_t start_pos = first_pass ? 0 : current_block + 1;
- return (start_pos + z) % lane_size;
+ uint64_t j1 = ctx->b.a[ctx->index] & 0xffffffff;
+
+ // Computes the area size.
+ // Pass 0 : all already finished segments plus already constructed
+ // blocks in this segment
+ // Pass 1+: 3 last segments plus already constructed
+ // blocks in this segment
+ // THIS IS NOT WHAT THE SPEC SAYS. HERE I COPY THE REFERENCE IMPLEMENTATION
+ //uint32_t area_size = first_pass ? current_block - 1 : lane_size - 2;
+ _Bool first_pass = ctx->pass_number == 0;
+ uint32_t slice_size = ctx->nb_blocks / 4;
+ uint32_t area_size = ((first_pass ? ctx->slice_number : 3)
+ * slice_size + ctx->index - 1);
+
+ uint32_t next_slice = (ctx->slice_number == 3
+ ? 0
+ : (ctx->slice_number + 1) * slice_size);
+
+ // Generates the actual index from J1
+ uint64_t x = (j1 * j1) >> 32;
+ uint64_t y = (area_size * x) >> 32;
+ uint64_t z = area_size - 1 - y;
+ uint32_t start_pos = first_pass ? 0 : next_slice;
+ printf("s%d_%d", start_pos, area_size);
+ uint32_t actual_pos = (start_pos + z) % ctx->nb_blocks;
+
+ ctx->index++; // updates index for the next call
+
+ return actual_pos;
}
void
block *blocks = work_area;
{
- uint8_t buf[4];
crypto_blake2b_ctx ctx;
crypto_blake2b_init(&ctx);
- store32_le(buf, 1 /* p */ ); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, tag_size ); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, nb_blocks ); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, nb_iterations); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, 0x13 /* v */ ); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, 1 /* y */ ); crypto_blake2b_update(&ctx, buf, 4);
- store32_le(buf, password_size); crypto_blake2b_update(&ctx, buf, 4);
+
+ blake_update_32 (&ctx, 1 ); // p: number of threads
+ blake_update_32 (&ctx, tag_size );
+ blake_update_32 (&ctx, nb_blocks );
+ blake_update_32 (&ctx, nb_iterations);
+ blake_update_32 (&ctx, 0x13 ); // v: version number
+ blake_update_32 (&ctx, 1 ); // y: Argon2i
+ blake_update_32 (&ctx, password_size);
crypto_blake2b_update(&ctx, password, password_size);
- store32_le(buf, salt_size ); crypto_blake2b_update(&ctx, buf, 4);
- crypto_blake2b_update(&ctx, salt, salt_size);
- store32_le(buf, key_size ); crypto_blake2b_update(&ctx, buf, 4);
- crypto_blake2b_update(&ctx, key, key_size);
- store32_le(buf, ad_size ); crypto_blake2b_update(&ctx, buf, 4);
- crypto_blake2b_update(&ctx, ad, ad_size);
-
- uint8_t initial_hash[72]; // 64 bytes plus additional words for future hashes
+ blake_update_32 (&ctx, salt_size);
+ crypto_blake2b_update(&ctx, salt, salt_size);
+ blake_update_32 (&ctx, key_size);
+ crypto_blake2b_update(&ctx, key, key_size);
+ blake_update_32 (&ctx, ad_size);
+ crypto_blake2b_update(&ctx, ad, ad_size);
+
+ uint8_t initial_hash[72]; // 64 bytes plus 2 words for future hashes
crypto_blake2b_final(&ctx, initial_hash);
+ /* // debug stuff */
+ /* int input_size = 40 + password_size + salt_size + key_size + ad_size; */
+ /* int i = 0; */
+ /* uint8_t *input = malloc(input_size); */
+ /* store32_le(input + i, 4 ); i += 4; */
+ /* store32_le(input + i, tag_size ); i += 4; */
+ /* store32_le(input + i, nb_blocks ); i += 4; */
+ /* store32_le(input + i, nb_iterations); i += 4; */
+ /* store32_le(input + i, 0x13 ); i += 4; */
+ /* store32_le(input + i, 1 ); i += 4; */
+ /* store32_le(input + i, password_size); i += 4; */
+ /* memcpy (input + i, password, password_size); i += password_size; */
+ /* store32_le(input + i, salt_size ); i += 4; */
+ /* memcpy (input + i, salt, salt_size ); i += salt_size; */
+ /* store32_le(input + i, key_size ); i += 4; */
+ /* memcpy (input + i, key, key_size ); i += key_size; */
+ /* store32_le(input + i, ad_size ); i += 4; */
+ /* memcpy (input + i, ad, ad_size ); i += ad_size; */
+ /* printf("input_size, i: %d, %d", input_size, i); */
+ /* for (int i = 0; i < input_size; i++) { */
+ /* if (i % 4 == 0) { */
+ /* printf("\n"); */
+ /* } */
+ /* printf("%02x", input[i]); */
+ /* } */
+ /* printf("\n"); */
+ /* printf("Memory : %d\n", nb_blocks ); */
+ /* printf("Iterations : %d\n", nb_iterations); */
+ /* printf("Parallelism: %d\n", 4 ); */
+ /* printf("Tag length : %d\n", tag_size ); */
+ /* printf("Pwd length : %d\n", password_size); */
+ /* printf("Slt length : %d\n", salt_size ); */
+ /* printf("Key length : %d\n", key_size ); */
+ /* printf("AD length : %d\n", ad_size ); */
+
+ printf("Pre-hashing digest:\n");
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ printf("%02x ", initial_hash[8*i + j]);
+ }
+ printf("\n");
+ }
+ // end debug stuff
+
// fill first 2 blocks
block tmp_block;
uint8_t hash_area[1024];
load_block(&tmp_block, hash_area);
copy_block(blocks, &tmp_block);
- store32_le(initial_hash + 68, 1); // slight modification
+ store32_le(initial_hash + 64, 1); // slight modification
extended_hash(hash_area, 1024, initial_hash, 72);
load_block(&tmp_block, hash_area);
- xor_block(blocks + 1, &tmp_block);
+ copy_block(blocks + 1, &tmp_block);
}
// Actual number of blocks
nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread)
const uint32_t segment_size = nb_blocks / 4;
- // fill the rest of the first segment
- {
- gidx_ctx ctx;
- gidx_init(&ctx, 1, 1, nb_blocks, nb_iterations);
- for (uint32_t i = 2; i < segment_size; i++) {
- binary_g(blocks + i, // current block
- blocks + i - 1, // previous block
- blocks + gidx_next(&ctx, i), // reference block
- copy_block); // first pass is a raw copy
- }
- }
- // fill the other 3 segments
- for (int segment = 1; segment < 4; segment++ ) {
- gidx_ctx ctx;
- gidx_init(&ctx, 1, segment + 1, nb_blocks, nb_iterations);
- for (uint32_t i = segment * segment_size;
- i < (segment + 1) * segment_size;
- i++) {
- binary_g(blocks + i, // current block
- blocks + i - 1, // previous block
- blocks + gidx_next(&ctx, i), // reference block
- copy_block); // first pass is a raw copy
- }
- }
+ // fill (then re-fill) the rest of the blocks
+ for (uint32_t pass_number = 0; pass_number < nb_iterations; pass_number++) {
+ _Bool first_pass = pass_number == 0;
+ // Simple copy on pass 0, XOR instead of overwrite on subsequent passes
+ void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block;
+
+ for (int segment = 0; segment < 4; segment++ ) {
+
+ gidx_ctx ctx;
+ gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
- // subsequent iterations (xor computations with previous results)
- for (int segment = 0; segment < 4; segment++ ) {
- gidx_ctx ctx;
- gidx_init(&ctx, 1, segment + 1, nb_blocks, nb_iterations);
- for (uint32_t i = segment * segment_size;
- i < (segment + 1) * segment_size;
- i++) {
- binary_g(blocks + i, // current block
- blocks + (i - 1) % nb_blocks, // previous block (modulo!)
- blocks + gidx_next(&ctx, i), // reference block
- xor_block); // subsequent passes are XOR
+ // On the first segment of the first pass,
+ // blocks 0 and 1 are already filled.
+ // We use the offset to skip them.
+ uint32_t offset = first_pass && segment == 0 ? 2 : 0;
+ // current, reference, and previous are block indices
+ for (uint32_t current = segment * segment_size + offset;
+ current < (segment + 1) * segment_size;
+ current++) {
+ uint32_t previous = current == 0 ? nb_blocks - 1 : current - 1;
+ uint32_t reference = gidx_next(&ctx);
+ // debug stuff
+ printf("(%2d,%2d,%2d) ", current, previous, reference);
+ // end debug stuff
+ binary_g(blocks + current,
+ blocks + previous,
+ blocks + reference,
+ xcopy);
+ }
+ printf("\n");
}
+ // debug stuff
+ for (uint32_t i = 0; i < nb_blocks; i++) {
+ printf("blocks[%2d]: %016lx\n", i, blocks[i].a[0]);
+ }
+ printf("\n");
+ // end debug stuf
}
-
// hash the very last block with H' into the output tag
uint8_t final_block[1024];
store_block(final_block, blocks + (nb_blocks - 1));