From 5d14fff16dce9636855a03585e990da6f577078e Mon Sep 17 00:00:00 2001
From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Fri, 10 Feb 2017 15:49:02 +0100
Subject: [PATCH] all in one compilation unit

---
 ae.c         |   60 ---
 ae.h         |   44 --
 argon2i.c    |  403 --------------
 argon2i.h    |   42 --
 blake2b.c    |  175 -------
 blake2b.h    |   50 --
 build.sh     |   12 +-
 chacha20.c   |  139 -----
 chacha20.h   |   89 ----
 ed25519.c    |  391 --------------
 ed25519.h    |   20 -
 lock.c       |   92 ----
 lock.h       |   61 ---
 monocypher.c | 1427 ++++++++++++++++++++++++++++++++++++++++++++++++++
 monocypher.h |  335 ++++++++++++
 poly1305.c   |  154 ------
 poly1305.h   |   45 --
 test.c       |   11 +-
 x25519.c     |  156 ------
 x25519.h     |   34 --
 20 files changed, 1765 insertions(+), 1975 deletions(-)
 delete mode 100644 ae.c
 delete mode 100644 ae.h
 delete mode 100644 argon2i.c
 delete mode 100644 argon2i.h
 delete mode 100644 blake2b.c
 delete mode 100644 blake2b.h
 delete mode 100644 chacha20.c
 delete mode 100644 chacha20.h
 delete mode 100644 ed25519.c
 delete mode 100644 ed25519.h
 delete mode 100644 lock.c
 delete mode 100644 lock.h
 create mode 100644 monocypher.c
 create mode 100644 monocypher.h
 delete mode 100644 poly1305.c
 delete mode 100644 poly1305.h
 delete mode 100644 x25519.c
 delete mode 100644 x25519.h

diff --git a/ae.c b/ae.c
deleted file mode 100644
index 95624d2..0000000
--- a/ae.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "ae.h"
-#include "chacha20.h"
-#include "poly1305.h"
-
-void crypto_ae_lock_detached(uint8_t        mac[16],
-                             uint8_t       *ciphertext,
-                             const uint8_t  key[32],
-                             const uint8_t  nonce[24],
-                             const uint8_t *plaintext,
-                             size_t         text_size)
-{
-    crypto_chacha_ctx e_ctx;
-    uint8_t           auth_key[32];
-    crypto_chacha20_Xinit (&e_ctx, key, nonce);
-    crypto_chacha20_random(&e_ctx, auth_key, 32);
-
-    crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size);
-    crypto_poly1305_auth(mac, ciphertext, text_size, auth_key);
-}
-
-int crypto_ae_unlock_detached(uint8_t       *plaintext,
-                              const uint8_t  key[32],
-                              const uint8_t  nonce[24],
-                              const uint8_t  mac[16],
-                              const uint8_t *ciphertext,
-                              size_t         text_size)
-{
-    crypto_chacha_ctx e_ctx;
-    uint8_t           auth_key[32];
-    crypto_chacha20_Xinit (&e_ctx, key, nonce);
-    crypto_chacha20_random(&e_ctx, auth_key, 32);
-
-    uint8_t real_mac[16];
-    crypto_poly1305_auth(real_mac, ciphertext, text_size, auth_key);
-
-    if (crypto_memcmp_16(real_mac, mac))
-        return -1;
-
-    crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size);
-    return 0;
-}
-
-void crypto_ae_lock(uint8_t       *box,
-                    const uint8_t  key[32],
-                    const uint8_t  nonce[24],
-                    const uint8_t *plaintext,
-                    size_t         text_size)
-{
-    crypto_ae_lock_detached(box, box + 16, key, nonce, plaintext, text_size);
-}
-
-int crypto_ae_unlock(uint8_t       *plaintext,
-                     const uint8_t  key[32],
-                     const uint8_t  nonce[24],
-                     const uint8_t *box,
-                     size_t         text_size)
-{
-    return crypto_ae_unlock_detached(plaintext, key, nonce,
-                                     box, box + 16, text_size);
-}
diff --git a/ae.h b/ae.h
deleted file mode 100644
index 9bd6fa2..0000000
--- a/ae.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef AE_H
-#define AE_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-
-// Authenticated encryption with XChacha20 and Poly1305.
-void crypto_ae_lock_detached(uint8_t        mac[16],
-                             uint8_t       *ciphertext,
-                             const uint8_t  key[32],
-                             const uint8_t  nonce[24],
-                             const uint8_t *plaintext,
-                             size_t         text_size);
-
-// Authenticated encryption with XChacha20 and Poly1305.
-// Returns -1 and has no effect if the message is forged.
-int crypto_ae_unlock_detached(uint8_t       *plaintext,
-                              const uint8_t  key[32],
-                              const uint8_t  nonce[24],
-                              const uint8_t  mac[16],
-                              const uint8_t *ciphertext,
-                              size_t         text_size);
-
-// Like the above, only puts the mac and the ciphertext together
-// in a "box", mac first
-void crypto_ae_lock(uint8_t       *box,      // text_size + 16
-                    const uint8_t  key[32],
-                    const uint8_t  nonce[24],
-                    const uint8_t *plaintext,
-                    size_t         text_size);
-
-// Unlocks a box locked by aead_lock()
-int crypto_ae_unlock(uint8_t       *plaintext,
-                     const uint8_t  key[32],
-                     const uint8_t  nonce[24],
-                     const uint8_t *box,     // text_size + 16
-                     size_t         text_size);
-
-
-
-
-
-#endif // AE_H
diff --git a/argon2i.c b/argon2i.c
deleted file mode 100644
index 771956e..0000000
--- a/argon2i.c
+++ /dev/null
@@ -1,403 +0,0 @@
-#include "argon2i.h"
-#include "blake2b.h"
-
-/////////////////
-/// Utilities ///
-/////////////////
-
-static uint64_t
-load64_le(const uint8_t s[8])
-{
-    // Portable, slow way
-    return (uint64_t)s[0]
-        | ((uint64_t)s[1] <<  8)
-        | ((uint64_t)s[2] << 16)
-        | ((uint64_t)s[3] << 24)
-        | ((uint64_t)s[4] << 32)
-        | ((uint64_t)s[5] << 40)
-        | ((uint64_t)s[6] << 48)
-        | ((uint64_t)s[7] << 56);
-}
-
-static void
-store32_le(uint8_t output[4], uint32_t input)
-{
-    // Portable, slow way.
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
-}
-
-static void
-store64_le(uint8_t output[8], uint64_t input)
-{
-    // Portable, slow way.
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
-    output[4] = (input >> 32) & 0xff;
-    output[5] = (input >> 40) & 0xff;
-    output[6] = (input >> 48) & 0xff;
-    output[7] = (input >> 56) & 0xff;
-}
-
-static uint64_t
-rotr64(uint64_t x, uint64_t y)
-{
-    return (x >> y) ^ (x << (64 - y));
-}
-
-static uint32_t
-min(uint32_t a, uint32_t b)
-{
-    return a <= b ? a : b;
-}
-
-// updates a blake2 hash with a 32 bit word, little endian.
-static void
-blake_update_32(crypto_blake2b_ctx *ctx, uint32_t input)
-{
-    uint8_t buf[4];
-    store32_le(buf, input);
-    crypto_blake2b_update(ctx, buf, 4);
-}
-
-//////////////////
-// Argon2 block //
-//////////////////
-typedef struct block {
-    uint64_t a[128]; // 1024 octets in 128 64-bit words
-} block;
-
-static void
-load_block(block *b, const uint8_t bytes[1024])
-{
-    for (int i = 0; i < 128; i++) {
-        b->a[i] = load64_le(bytes + i * 8);
-    }
-}
-
-static void
-store_block(uint8_t bytes[1024], const block *b)
-{
-    for (int i = 0; i < 128; i++) {
-        store64_le(bytes + i * 8, b->a[i]);
-    }
-}
-
-static void
-copy_block(block *out, const block *in)
-{
-    for (int i = 0; i < 128; i++) {
-        out->a[i] = in->a[i];
-    }
-}
-
-static void
-xor_block(block *out, const block *in)
-{
-    for (int i = 0; i < 128; i++) {
-        out->a[i] ^= in->a[i];
-    }
-}
-
-////////////////////
-// Argon2i proper //
-////////////////////
-
-// Hash with a virtually unlimited digest size.
-// Doesn't extract more entropy than the base hash function.
-// Mainly used for filling a whole kilobyte block with pseudo-random bytes.
-static void
-extended_hash(uint8_t       *digest, uint32_t digest_size,
-              const uint8_t *input , uint32_t input_size)
-{
-    crypto_blake2b_ctx ctx;
-    crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0);
-    blake_update_32            (&ctx, digest_size);
-    crypto_blake2b_update      (&ctx, input, input_size);
-    crypto_blake2b_final       (&ctx, digest);
-
-    if (digest_size > 64) {
-        // the conversion to u64 avoids integer overflow on
-        // ludicrously big hash sizes.
-        uint32_t r   = (((uint64_t)digest_size + 31) / 32) - 2;
-        uint32_t i   =  1;
-        uint32_t in  =  0;
-        uint32_t out = 32;
-        while (i < r) {
-            // Input and output overlap.
-            // This shouldn't be a problem.
-            crypto_blake2b(digest + out, digest + in, 64);
-            i   +=  1;
-            in  += 32;
-            out += 32;
-        }
-        crypto_blake2b_general(digest + out, digest_size - (32 * r),
-                               0, 0, // no key
-                               digest + in , 64);
-    }
-}
-
-// Core of the compression function G.  Computes Z from R in place.
-static void
-g_rounds(block *work_block)
-{
-#define LSB(x) ((x) & 0xffffffff)
-#define G(a, b, c, d)                                            \
-    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
-    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
-    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
-    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
-#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
-              v8,  v9, v10, v11, v12, v13, v14, v15)    \
-    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
-    G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
-    G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
-    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
-
-    // column rounds (work_block = Q)
-    for (int i = 0; i < 128; i += 16) {
-        ROUND(work_block->a[i     ], work_block->a[i +  1],
-              work_block->a[i +  2], work_block->a[i +  3],
-              work_block->a[i +  4], work_block->a[i +  5],
-              work_block->a[i +  6], work_block->a[i +  7],
-              work_block->a[i +  8], work_block->a[i +  9],
-              work_block->a[i + 10], work_block->a[i + 11],
-              work_block->a[i + 12], work_block->a[i + 13],
-              work_block->a[i + 14], work_block->a[i + 15]);
-    }
-    // row rounds (work_block = Z)
-    for (int i = 0; i < 16; i += 2) {
-        ROUND(work_block->a[i      ], work_block->a[i +   1],
-              work_block->a[i +  16], work_block->a[i +  17],
-              work_block->a[i +  32], work_block->a[i +  33],
-              work_block->a[i +  48], work_block->a[i +  49],
-              work_block->a[i +  64], work_block->a[i +  65],
-              work_block->a[i +  80], work_block->a[i +  81],
-              work_block->a[i +  96], work_block->a[i +  97],
-              work_block->a[i + 112], work_block->a[i + 113]);
-    }
-}
-
-// The compression function G
-// may overwrite result completely  (xcopy == copy_block),
-// or XOR result with the old block (xcopy ==  xor_block)
-static void
-binary_g(block *result, const block *x, const block *y,
-         void (*xcopy) (block*, const block*))
-{
-    // put R = X ^ Y into tmp
-    block tmp;
-    copy_block(&tmp, x);
-    xor_block (&tmp, y);
-
-    xcopy(result, &tmp);     // save R (erase or xor the old block)
-    g_rounds(&tmp);          // tmp = Z
-    xor_block(result, &tmp); // result =  R ^ Z (or R ^ Z ^ old)
-}
-
-// unary version of the compression function.
-// The missing argument is implied zero.
-// Does the transformation in place.
-static void
-unary_g(block *work_block)
-{
-    // work_block == R
-    block tmp;
-    copy_block(&tmp, work_block); // tmp = R
-    g_rounds(work_block);         // work_block = Z
-    xor_block(work_block, &tmp);  // work_block = Z ^ R
-}
-
-typedef struct gidx_ctx {
-    block    b;
-    uint32_t pass_number;
-    uint32_t slice_number;
-    uint32_t nb_blocks;
-    uint32_t nb_iterations;
-    uint32_t ctr;
-    uint32_t index;
-} gidx_ctx;
-
-static void
-gidx_refresh(gidx_ctx *ctx)
-{
-    ctx->b.a[0] = ctx->pass_number;
-    ctx->b.a[1] = 0;  // lane number (we have only one)
-    ctx->b.a[2] = ctx->slice_number;
-    ctx->b.a[3] = ctx->nb_blocks;
-    ctx->b.a[4] = ctx->nb_iterations;
-    ctx->b.a[5] = 1;  // type: Argon2i
-    ctx->b.a[6] = ctx->ctr;
-    // zero the rest of the block
-    for (int i = 7; i < 128; i++) {
-        ctx->b.a[i] = 0;
-    }
-
-    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
-    // Applies the G "square" function to get cheap pseudo-random numbers.
-    unary_g(&(ctx->b));
-    unary_g(&(ctx->b)); // square means apply it twice
-}
-
-static void
-gidx_init(gidx_ctx *ctx,
-          uint32_t pass_number,
-          uint32_t slice_number,
-          uint32_t nb_blocks,
-          uint32_t nb_iterations)
-{
-    ctx->pass_number   = pass_number;
-    ctx->slice_number  = slice_number;
-    ctx->nb_blocks     = nb_blocks;
-    ctx->nb_iterations = nb_iterations;
-    ctx->ctr           = 1;   // not zero, surprisingly
-    ctx->index         = pass_number == 0 && slice_number == 0 ? 2 : 0;
-    gidx_refresh(ctx);
-}
-
-static uint32_t
-gidx_next(gidx_ctx *ctx)
-{
-    // lazily creates the index block we need
-    if (ctx->index == 128) {
-        ctx->index = 0;
-        ctx->ctr++;
-        gidx_refresh(ctx);
-    }
-    // saves and increment the index
-    uint32_t index = ctx->index;
-    ctx->index++; // updates index for the next call
-
-    // Computes the area size.
-    // Pass 0 : all already finished segments plus already constructed
-    //          blocks in this segment
-    // Pass 1+: 3 last segments plus already constructed
-    //          blocks in this segment  THE SPEC SUGGESTS OTHERWISE.
-    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
-    _Bool    first_pass = ctx->pass_number == 0;
-    uint32_t slice_size = ctx->nb_blocks / 4;
-    uint32_t area_size  = ((first_pass ? ctx->slice_number : 3)
-                           * slice_size + index - 1);
-
-    // Computes the starting position of the reference area.
-    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
-    // NEXT SEGMENT, NOT THE NEXT BLOCK.
-    uint32_t next_slice = (ctx->slice_number == 3
-                           ? 0
-                           : (ctx->slice_number + 1) * slice_size);
-    uint32_t start_pos  = first_pass ? 0 : next_slice;
-
-    // Generates the actual index from J1 (no need for J2, there's only one lane)
-    uint64_t j1         = ctx->b.a[index] & 0xffffffff; // pseudo-random number
-    uint64_t x          = (j1 * j1)       >> 32;
-    uint64_t y          = (area_size * x) >> 32;
-    uint64_t z          = area_size - 1 - y;
-    return (start_pos + z) % ctx->nb_blocks;
-}
-
-// Main algorithm
-void
-crypto_argon2i_hash(uint8_t       *tag,       uint32_t tag_size,
-                    const uint8_t *password,  uint32_t password_size,
-                    const uint8_t *salt,      uint32_t salt_size,
-                    const uint8_t *key,       uint32_t key_size,
-                    const uint8_t *ad,        uint32_t ad_size,
-                    void *work_area,
-                    uint32_t nb_blocks,
-                    uint32_t nb_iterations)
-{
-    // work area seen as blocks (must be suitably aligned)
-    block *blocks = work_area;
-    {
-        crypto_blake2b_ctx ctx;
-        crypto_blake2b_init(&ctx);
-
-        blake_update_32      (&ctx, 1            ); // p: number of threads
-        blake_update_32      (&ctx, tag_size     );
-        blake_update_32      (&ctx, nb_blocks    );
-        blake_update_32      (&ctx, nb_iterations);
-        blake_update_32      (&ctx, 0x13         ); // v: version number
-        blake_update_32      (&ctx, 1            ); // y: Argon2i
-        blake_update_32      (&ctx,           password_size);
-        crypto_blake2b_update(&ctx, password, password_size);
-        blake_update_32      (&ctx,           salt_size);
-        crypto_blake2b_update(&ctx, salt,     salt_size);
-        blake_update_32      (&ctx,           key_size);
-        crypto_blake2b_update(&ctx, key,      key_size);
-        blake_update_32      (&ctx,           ad_size);
-        crypto_blake2b_update(&ctx, ad,       ad_size);
-
-        uint8_t initial_hash[72]; // 64 bytes plus 2 words for future hashes
-        crypto_blake2b_final(&ctx, initial_hash);
-
-        // fill first 2 blocks
-        block   tmp_block;
-        uint8_t hash_area[1024];
-        store32_le(initial_hash + 64, 0); // first  additional word
-        store32_le(initial_hash + 68, 0); // second additional word
-        extended_hash(hash_area, 1024, initial_hash, 72);
-        load_block(&tmp_block, hash_area);
-        copy_block(blocks, &tmp_block);
-
-        store32_le(initial_hash + 64, 1); // slight modification
-        extended_hash(hash_area, 1024, initial_hash, 72);
-        load_block(&tmp_block, hash_area);
-        copy_block(blocks + 1, &tmp_block);
-    }
-
-    // Actual number of blocks
-    nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread)
-    const uint32_t segment_size = nb_blocks / 4;
-
-    // fill (then re-fill) the rest of the blocks
-    for (uint32_t pass_number = 0; pass_number < nb_iterations; pass_number++) {
-        _Bool     first_pass  = pass_number == 0;
-        // Simple copy on pass 0, XOR instead of overwrite on subsequent passes
-        void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block;
-
-        for (int segment = 0; segment < 4; segment++ ) {
-
-            gidx_ctx ctx;
-            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
-
-            // On the first segment of the first pass,
-            // blocks 0 and 1 are already filled.
-            // We use the offset to skip them.
-            uint32_t offset = first_pass && segment == 0 ? 2 : 0;
-            // current, reference, and previous are block indices
-            for (uint32_t current =  segment      * segment_size + offset;
-                 current          < (segment + 1) * segment_size;
-                 current++) {
-                uint32_t previous  = current == 0 ? nb_blocks - 1 : current - 1;
-                uint32_t reference = gidx_next(&ctx);
-                binary_g(blocks + current,
-                         blocks + previous,
-                         blocks + reference,
-                         xcopy);
-            }
-        }
-    }
-    // hash the very last block with H' into the output tag
-    uint8_t final_block[1024];
-    store_block(final_block, blocks + (nb_blocks - 1));
-    extended_hash(tag, tag_size, final_block, 1024);
-}
-
-void
-crypto_argon2i(uint8_t        tag[32],
-               const uint8_t *password,  uint32_t password_size,
-               const uint8_t *salt,      uint32_t salt_size,
-               void    *work_area,
-               uint32_t nb_blocks,
-               uint32_t nb_iterations)
-{
-    crypto_argon2i_hash(tag     , 32,
-                        password, password_size,
-                        salt    , salt_size,
-                        0, 0, 0, 0,
-                        work_area, nb_blocks, nb_iterations);
-}
diff --git a/argon2i.h b/argon2i.h
deleted file mode 100644
index b7babf2..0000000
--- a/argon2i.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef ARGON2I_H
-#define ARGON2I_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-// Implements argon2i, with degree of paralelism 1,
-// because it's good enough, and threads are scary.
-//
-// key and ad are optionnal.  They can be NULL if their respective size is 0.
-// work_area is a pointer to a contiguous chunk of memory of at least
-// nb_blocks * 1024 bytes.  It must be suitably aligned for 64-bit words.
-// Don't worry too much about alignment, malloc()'s results work.
-//
-// Choice of parameters for password hashing:
-// - If you need a key, use a 32 bytes one.
-// - Do what you will with the ad.
-// - Use a 32 bytes tag (to get a 256-bit key)
-// - Put 128 bits of entropy in the salt.  16 random bytes work well.
-// - Use all the memory you can get away with.
-// - Use as much iterations as reasonable.  No less than 10 passes if you can.
-void
-crypto_argon2i_hash(uint8_t       *tag,       uint32_t tag_size,      // >= 4
-                    const uint8_t *password,  uint32_t password_size,
-                    const uint8_t *salt,      uint32_t salt_size,     // >= 8
-                    const uint8_t *key,       uint32_t key_size,
-                    const uint8_t *ad,        uint32_t ad_size,
-                    void    *work_area,
-                    uint32_t nb_blocks,                               // >= 8
-                    uint32_t nb_iterations);
-
-// Convenience function. No key, no ad, 64 bytes tag
-void
-crypto_argon2i(uint8_t        tag[32],
-               const uint8_t *password,  uint32_t password_size,
-               const uint8_t *salt,      uint32_t salt_size,     // >= 8
-               void    *work_area,
-               uint32_t nb_blocks,                               // >= 8
-               uint32_t nb_iterations);
-
-
-#endif // ARGON2I_H
diff --git a/blake2b.c b/blake2b.c
deleted file mode 100644
index f9fb269..0000000
--- a/blake2b.c
+++ /dev/null
@@ -1,175 +0,0 @@
-// ripped off from the reference implentation in RFC 7693
-
-#include "blake2b.h"
-
-// Cyclic right rotation.
-static uint64_t
-rotr64(uint64_t x, uint64_t y)
-{
-    return (x >> y) ^ (x << (64 - y));
-}
-
-static uint64_t
-load64_le(uint8_t *s)
-{
-    // portable, slow way
-    return
-        ((uint64_t)s[0]      ) ^
-        ((uint64_t)s[1] <<  8) ^
-        ((uint64_t)s[2] << 16) ^
-        ((uint64_t)s[3] << 24) ^
-        ((uint64_t)s[4] << 32) ^
-        ((uint64_t)s[5] << 40) ^
-        ((uint64_t)s[6] << 48) ^
-        ((uint64_t)s[7] << 56);
-}
-
-// Initialization Vector.
-static const uint64_t blake2b_iv[8] = {
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-};
-
-// increment a 128-bit "word".
-static void
-incr(uint64_t x[2], uint64_t y)
-{
-    x[0] += y;                 // increment the low word
-    if (x[0] < y) { x[1]++; }  // handle overflow
-}
-
-static void
-blake2b_compress(crypto_blake2b_ctx *ctx, _Bool last_block)
-{
-    static const uint8_t sigma[12][16] = {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-    };
-
-    // init work variables (before shuffling them)
-    uint64_t v[16];
-    for (int i = 0; i < 8; i++) {
-        v[i    ] = ctx->hash[i];
-        v[i + 8] = blake2b_iv[i];
-    }
-    v[12] ^= ctx->input_size[0]; // low 64 bits of offset
-    v[13] ^= ctx->input_size[1]; // high 64 bits
-    if (last_block) { v[14] = ~v[14]; }
-
-    // load the input buffer
-    uint64_t m[16];
-    for (int i = 0; i < 16; i++) {
-        m[i] = load64_le(&ctx->buf[i * 8]);
-    }
-
-    // shuffle the work variables with the 12 rounds
-    for (int i = 0; i < 12; i++) {
-#define B2B_G(a, b, c, d, x, y)                                    \
-        v[a] += v[b] + x;  v[d] ^= v[a];  v[d] = rotr64(v[d], 32); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 24); \
-        v[a] += v[b] + y;  v[d] ^= v[a];  v[d] = rotr64(v[d], 16); \
-        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 63)
-
-        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
-    }
-
-    // accumulate the work variables into the hash
-    for(int i = 0; i < 8; i++) {
-        ctx->hash[i] ^= v[i] ^ v[i + 8];
-    }
-}
-
-void
-crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen,
-                            const uint8_t      *key, size_t keylen)
-{
-    // Initial hash == initialization vector...
-    for (int i = 0; i < 8; i++) {
-        ctx->hash[i] = blake2b_iv[i];
-    }
-    ctx->hash[0]      ^= 0x01010000 ^ (keylen << 8) ^ outlen;  // ...mostly
-    ctx->input_size[0] = 0;       // input count low word
-    ctx->input_size[1] = 0;       // input count high word
-    ctx->c             = 0;       // pointer within buffer
-    ctx->output_size   = outlen;  // size of the final hash
-
-    // If there's a key, put it in the first block, then pad with zeroes
-    if (keylen > 0) {
-        for (size_t i = 0     ; i < keylen; i++) { ctx->buf[i] = key[i]; }
-        for (size_t i = keylen; i < 128   ; i++) { ctx->buf[i] = 0;      }
-        ctx->c = 128; // mark the block as used
-    }
-}
-
-void
-crypto_blake2b_init(crypto_blake2b_ctx *ctx)
-{
-    crypto_blake2b_general_init(ctx, 64, 0, 0);
-}
-
-void
-crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen)
-{
-    for (size_t i = 0; i < inlen; i++) {
-        // If the buffer is full, increment the counters and
-        // add (compress) the current buffer to the hash
-        if (ctx->c == 128) {
-            ctx->c = 0;
-            incr(ctx->input_size, 128);
-            blake2b_compress(ctx, 0); // not last time -> 0
-        }
-        // By now the buffer is not full.  We add one input byte.
-        ctx->buf[ctx->c] = in[i];
-        ctx->c++;
-    }
-}
-
-void
-crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out)
-{
-    // update input size, pad then compress the buffer
-    incr(ctx->input_size, ctx->c);
-    for (int i = ctx->c; i < 128; i++) { ctx->buf[i] = 0; }
-    blake2b_compress(ctx, 1); // last time -> 1
-
-    // copy the hash in the output (little endian of course)
-    for (int i = 0; i < ctx->output_size; i++) {
-        out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xFF;
-    }
-}
-
-void
-crypto_blake2b_general(      uint8_t*out, size_t outlen,
-                       const uint8_t*key, size_t keylen,
-                       const uint8_t*in,  size_t inlen)
-{
-    crypto_blake2b_ctx ctx;
-    crypto_blake2b_general_init(&ctx, outlen, key, keylen);
-    crypto_blake2b_update(&ctx, in, inlen);
-    crypto_blake2b_final(&ctx, out);
-}
-
-void
-crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen)
-{
-    crypto_blake2b_general(out, 64, 0, 0, in, inlen);
-}
diff --git a/blake2b.h b/blake2b.h
deleted file mode 100644
index dd3338b..0000000
--- a/blake2b.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef BLAKE2B_H
-#define BLAKE2B_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-// blake2b context
-typedef struct {
-    uint8_t  buf[128];      // input buffer
-    uint64_t hash[8];       // chained state
-    uint64_t input_size[2]; // total number of bytes
-    uint8_t  c;             // pointer for buf[]
-    uint8_t  output_size;   // digest size
-} crypto_blake2b_ctx;
-
-// Initializes the context with user defined parameters:
-// outlen: the length of the hash.  Must be between 1 and 64.
-// keylen: length of the key.       Must be between 0 and 64.
-// key   : some secret key.         May be NULL if keylen is 0.
-// Any deviation from these invariants results in UNDEFINED BEHAVIOR
-void
-crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen,
-                            const uint8_t      *key, size_t keylen);
-
-// Convenience function: 64 bytes hash, no secret key.
-void
-crypto_blake2b_init(crypto_blake2b_ctx *ctx);
-
-// Add "inlen" bytes from "in" into the hash.
-void
-crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen);
-
-// Generate the message digest (size given in init).
-void
-crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out);
-
-// All-in-one convenience function.
-// outlen, keylen, and key work the same as they do in the general_init function
-void
-crypto_blake2b_general(      uint8_t *out, size_t outlen, // digest
-                       const uint8_t *key, size_t keylen, // optional secret key
-                       const uint8_t *in , size_t inlen); // data to be hashed
-
-// All-in-one convenience function: 64 bytes hash, no secret key.
-void
-crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen);
-
-
-
-#endif // BLAKE2B_H
diff --git a/build.sh b/build.sh
index affbfd4..256c001 100755
--- a/build.sh
+++ b/build.sh
@@ -3,15 +3,7 @@
 CC="gcc"
 CFLAGS="-O2 -Wall -Wextra -std=c11"
 
-$CC $CFLAGS -c chacha20.c
-$CC $CFLAGS -c blake2b.c
-$CC $CFLAGS -c poly1305.c
-$CC $CFLAGS -c argon2i.c
-$CC $CFLAGS -c ae.c
-$CC $CFLAGS -c x25519.c
-$CC $CFLAGS -c ed25519.c -DED25519_SHA512
-$CC $CFLAGS -c lock.c
+$CC $CFLAGS -c monocypher.c -DED25519_SHA512
 $CC $CFLAGS -c sha512.c
 $CC $CFLAGS -c test.c
-
-$CC $CFLAGS -o test test.o chacha20.o argon2i.o blake2b.o poly1305.o x25519.o ae.o lock.o sha512.o ed25519.o
+$CC $CFLAGS -o test test.o monocypher.o sha512.o
diff --git a/chacha20.c b/chacha20.c
deleted file mode 100644
index 023f114..0000000
--- a/chacha20.c
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "chacha20.h"
-
-static uint32_t
-load32_le(const uint8_t s[4])
-{
-    // Portable, slow way.
-    // Only affects initialisation, though.
-    return s[0]
-        | (s[1] <<  8)
-        | (s[2] << 16)
-        | (s[3] << 24);
-}
-
-static void
-store32_le(uint8_t output[4], uint32_t input)
-{
-    // Portable, slow way.
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
-}
-
-static void
-chacha20_rounds(uint32_t out[16], const uint32_t in[16])
-{
-    for (int i = 0; i < 16; i++)
-        out[i] = in[i];
-
-    for (int i = 0; i < 10; i++) { // 20 rounds, 2 rounds per loop.
-#define ROT_L32(x, n) x = (x << n) | (x >> (32 - n))
-#define QUARTERROUND(a, b, c, d)           \
-        a += b;  d ^= a;  ROT_L32(d, 16);  \
-        c += d;  b ^= c;  ROT_L32(b, 12);  \
-        a += b;  d ^= a;  ROT_L32(d,  8);  \
-        c += d;  b ^= c;  ROT_L32(b,  7)
-
-        QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0
-        QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1
-        QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2
-        QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3
-        QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1
-        QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2
-        QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3
-        QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4
-    }
-}
-
-static void
-chacha20_init_key(crypto_chacha_ctx *ctx, const uint8_t key[32])
-{
-    // constant
-    ctx->input[0]   = load32_le((uint8_t*)"expa");
-    ctx->input[1]   = load32_le((uint8_t*)"nd 3");
-    ctx->input[2]   = load32_le((uint8_t*)"2-by");
-    ctx->input[3]   = load32_le((uint8_t*)"te k");
-    // key
-    for (int i = 0; i < 8; i++)
-        ctx->input[i + 4] = load32_le(key + i*4);
-    // pool index (the random pool starts empty)
-    ctx->pool_index = 64;
-}
-
-void
-crypto_chacha20_H(uint8_t       out[32],
-                  const uint8_t key[32],
-                  const uint8_t in [16])
-{
-    crypto_chacha_ctx ctx;
-    chacha20_init_key(&ctx, key);
-    for (int i = 0; i < 4; i++)
-        ctx.input[i + 12] = load32_le(in + i*4);
-
-    uint32_t buffer[16];
-    chacha20_rounds(buffer, ctx.input);
-    // prevents reversal of the rounds by revealing only half of the buffer.
-    for (int i = 0; i < 4; i++) {
-        store32_le(out      + i*4, buffer[i     ]); // constant
-        store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce
-    }
-}
-
-void
-crypto_chacha20_init(crypto_chacha_ctx *ctx,
-                     const uint8_t      key[32],
-                     const uint8_t      nonce[8])
-{
-    chacha20_init_key(ctx, key  );         // key
-    ctx->input[12] = 0;                    // counter
-    ctx->input[13] = 0;                    // counter
-    ctx->input[14] = load32_le(nonce + 0); // nonce
-    ctx->input[15] = load32_le(nonce + 4); // nonce
-}
-
-void
-crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
-                      const uint8_t      key[32],
-                      const uint8_t      nonce[24])
-{
-    uint8_t derived_key[32];
-    crypto_chacha20_H(derived_key, key, nonce);
-    crypto_chacha20_init(ctx, derived_key, nonce + 16);
-}
-
-void
-crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
-                        const uint8_t     *plain_text,
-                        uint8_t           *cipher_text,
-                        size_t             message_size)
-{
-    for (size_t i = 0; i < message_size; i++) {
-        // refill the pool if empty
-        if (ctx->pool_index == 64) {
-            // fill the pool
-            uint32_t buffer[16];
-            chacha20_rounds(buffer, ctx->input);
-            for (int i = 0; i < 16; i++)
-                store32_le(ctx->random_pool + i*4, buffer[i] + ctx->input[i]);
-            // update the counters
-            ctx->pool_index = 0;
-            ctx->input[12]++;
-            if (!ctx->input[12])
-                ctx->input[13]++;
-        }
-        // use the pool for encryption (or random stream)
-        cipher_text[i] =
-            (plain_text == 0 ? 0 : plain_text[i])
-            ^ ctx->random_pool[ctx->pool_index];
-        ctx->pool_index++;
-    }
-}
-
-void
-crypto_chacha20_random(crypto_chacha_ctx *ctx,
-                       uint8_t           *cipher_text,
-                       size_t             message_size)
-{
-    crypto_chacha20_encrypt(ctx, 0, cipher_text, message_size);
-}
diff --git a/chacha20.h b/chacha20.h
deleted file mode 100644
index 9f63614..0000000
--- a/chacha20.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef CHACHA20_H
-#define CHACHA20_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-// This is a chacha20 context.
-// To use safely, just follow these guidelines:
-// - Always initialize your context with one of the crypto_init_* functions below
-// - Dont't modify it, except through the crypto_chacha20_* below.
-// - Never duplicate it.
-typedef struct crypto_chacha_ctx {
-    uint32_t input[16];       // current input, unencrypted
-    uint8_t  random_pool[64]; // last input, encrypted
-    uint8_t  pool_index;      // pointer to random_pool
-} crypto_chacha_ctx;
-
-// HChacha20.  *Kind* of a cryptographic hash, based on the chacha20 rounds.
-// Used for XChacha20, and the key derivation of the X25519 shared secret.
-// Don't use it unless you really know what you're doing.
-void
-crypto_chacha20_H(uint8_t       out[32],
-                  const uint8_t key[32],
-                  const uint8_t in [16]);
-
-// Initializes a chacha context.
-//
-// WARNING: DON'T USE THE SAME NONCE AND KEY TWICE
-//
-// You'd be exposing the XOR of subsequent encrypted
-// messages, thus destroying your confidentiality.
-//
-// WARNING: DON'T SELECT THE NONCE AT RANDOM
-//
-// If you encode enough messages with a random nonce, there's a good
-// chance some of them will use the same nonce by accident. 64 bits
-// just isn't enough for this.  Use a counter instead.
-//
-// If there are multiple parties sending out messages, you can give them
-// all an initial nonce of 0, 1 .. n-1 respectively, and have them increment
-// their nonce  by n.  (Also make sure the nonces never wrap around.).
-void
-crypto_chacha20_init(crypto_chacha_ctx *ctx,
-                     const uint8_t      key[32],
-                     const uint8_t      nonce[8]);
-
-// Initializes a chacha context, with a big nonce (192 bits),
-// more than enough to be selected at random.
-//
-// The price you pay for that is a slower initialization.  The security
-// guarantees are the same as regular initialization.
-void
-crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
-                      const uint8_t      key[32],
-                      const uint8_t      nonce[24]);
-
-// Encrypts the plain_text by XORing it with a pseudo-random
-// stream of numbers, seeded by the provided chacha20 context.
-// Decryption uses the exact same method.
-//
-// Once the context is initialized, encryptions can safely be chained thus:
-//
-//    crypto_encrypt_chacha20(ctx, plain_0, cipher_0, length_0);
-//    crypto_encrypt_chacha20(ctx, plain_1, cipher_1, length_1);
-//    crypto_encrypt_chacha20(ctx, plain_2, cipher_2, length_2);
-//
-// plain_text and cipher_text may point to the same location, for in-place
-// encryption.
-//
-// plain_text is allowed to be null (0), in which case it will be
-// interpreted as an all zero input.  The cipher_text will then
-// contain the raw chacha20 stream.  Useful as a random number
-// generator.
-//
-// WARNING: ENCRYPTION ALONE IS NOT SECURE.  YOU NEED AUTHENTICATION AS WELL.
-// Use the provided authenticated encryption constructions.
-void
-crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
-                        const uint8_t     *plain_text,
-                        uint8_t           *cipher_text,
-                        size_t             message_size);
-
-// convenience function.  Same as chacha20_encrypt() with a null plain_text.
-void
-crypto_chacha20_random(crypto_chacha_ctx *ctx,
-                       uint8_t           *cipher_text,
-                       size_t             message_size);
-
-#endif // CHACHA20_H
diff --git a/ed25519.c b/ed25519.c
deleted file mode 100644
index a1baa65..0000000
--- a/ed25519.c
+++ /dev/null
@@ -1,391 +0,0 @@
-// Taken from TweetNaCl.
-// I tried the ref10 implementation, but that was too damn big
-
-#include "ed25519.h"
-
-#define FOR(i, start, end) for (size_t i = start; i < end; i++)
-#define sv static void
-#define sc static const
-
-typedef uint8_t   u8;
-typedef int64_t  i64;
-typedef uint64_t u64;
-typedef i64 gf[16];
-
-sc gf gf0;
-sc gf gf1 = { 1 };
-sc gf  D  = { 0x78a3, 0x1359, 0x4dca, 0x75eb, 0xd8ab, 0x4141, 0x0a4d, 0x0070,
-              0xe898, 0x7779, 0x4079, 0x8cc7, 0xfe73, 0x2b6f, 0x6cee, 0x5203};
-sc gf  D2 = { 0xf159, 0x26b2, 0x9b94, 0xebd6, 0xb156, 0x8283, 0x149a, 0x00e0,
-              0xd130, 0xeef3, 0x80f2, 0x198e, 0xfce7, 0x56df, 0xd9dc, 0x2406};
-sc gf  X  = { 0xd51a, 0x8f25, 0x2d60, 0xc956, 0xa7b2, 0x9525, 0xc760, 0x692c,
-              0xdc5c, 0xfdd6, 0xe231, 0xc0a4, 0x53fe, 0xcd6e, 0x36d3, 0x2169};
-sc gf  Y  = { 0x6658, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666,
-              0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666};
-sc gf  I  = { 0xa0b0, 0x4a0e, 0x1b27, 0xc4ee, 0xe478, 0xad2f, 0x1806, 0x2f43,
-              0xd7a7, 0x3dfb, 0x0099, 0x2b4d, 0xdf0b, 0x4fc1, 0x2480, 0x2b83};
-
-sc u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
-                 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
-                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 };
-
-sv car_25519(gf o)
-{
-    FOR(i, 0, 16) {
-        o[i]              += 1LL  << 16;
-        i64 c          = o[i] >> 16;
-        o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15));
-        o[i]              -= c << 16;
-    }
-}
-
-sv sel_25519(gf p, gf q, int b)
-{
-    i64 c = ~(b-1);
-    FOR(i, 0, 16) {
-        i64 t = c & (p[i] ^ q[i]);
-        p[i]     ^= t;
-        q[i]     ^= t;
-    }
-}
-
-sv pack_25519(u8 *o, const gf n)
-{
-    gf t;
-    FOR(i, 0, 16) t[i] = n[i];
-    car_25519(t);
-    car_25519(t);
-    car_25519(t);
-    FOR(j, 0, 2) {
-        gf m;
-        m[0] = t[0] - 0xffed;
-        FOR(i, 1, 15) {
-            m[i  ]  = t[i] - 0xffff - ((m[i-1] >> 16) & 1);
-            m[i-1] &= 0xffff;
-        }
-        m[15]  = t[15] - 0x7fff - ((m[14] >> 16) & 1);
-        int b  = (m[15] >> 16) & 1;
-        m[14] &= 0xffff;
-        sel_25519(t, m, 1-b);
-    }
-    FOR(i, 0, 16) {
-        o[2*i    ] = t[i] & 0xff;
-        o[2*i + 1] = t[i] >> 8;
-    }
-}
-
-sv A(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] + b[i]; }
-sv Z(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] - b[i]; }
-sv M(gf o, const gf a, const gf b)
-{
-    i64 t[31];
-    FOR(i, 0, 31) t[i] = 0;
-    FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j];
-    FOR(i, 0, 15) t[i] += 38 * t[i+16];
-    FOR(i, 0, 16) o[i] = t[i];
-    car_25519(o);
-    car_25519(o);
-}
-sv S(gf o,const gf a){ M(o, a, a); }
-
-sv inv_25519(gf o,const gf i)
-{
-    gf c;
-    FOR(a, 0, 16) c[a] = i[a];
-    for(int a = 253; a >= 0; a--) {
-        S(c, c);
-        if(a != 2 && a != 4)
-            M(c, c, i);
-    }
-    FOR(a, 0, 16) o[a] = c[a];
-}
-
-sv unpack_25519(gf o, const u8 *n)
-{
-    FOR(i, 0, 16) o[i] = n[2*i] + ((i64)n[2*i + 1] << 8);
-    o[15] &= 0x7fff;
-}
-
-sv set_25519(gf r, const gf a) { FOR(i, 0, 16) r[i] = a[i]; }
-
-static u8 par_25519(const gf a)
-{
-    u8 d[32];
-    pack_25519(d, a);
-    return d[0] & 1;
-}
-
-sv pow2523(gf o,const gf i)
-{
-    gf c;
-    FOR(a, 0, 16) c[a] = i[a];
-    for(int a = 250; a >= 0; a--) {
-        S(c, c);
-        if(a != 1) M(c, c, i);
-    }
-    FOR(a, 0, 16) o[a] = c[a];
-}
-
-static int vn(const u8 *x, const u8 *y, size_t n)
-{
-  uint32_t d = 0;
-  FOR(i, 0, n) d |= x[i] ^ y[i];
-  return (1 & ((d - 1) >> 8)) - 1;
-}
-
-static int neq_25519(const gf a, const gf b)
-{
-    u8 c[32],d[32];
-    pack_25519(c, a);
-    pack_25519(d, b);
-    return vn(c, d, 32);
-}
-
-sv add(gf p[4], gf q[4])
-{
-    gf a, b, c, d, t, e, f, g, h;
-    Z(a, p[1], p[0]);
-    Z(t, q[1], q[0]);
-    M(a, a, t);
-    A(b, p[0], p[1]);
-    A(t, q[0], q[1]);
-    M(b, b, t);
-    M(c, p[3], q[3]);
-    M(c, c, D2);
-    M(d, p[2], q[2]);
-    A(d, d, d);
-    Z(e, b, a);
-    Z(f, d, c);
-    A(g, d, c);
-    A(h, b, a);
-
-    M(p[0], e, f);
-    M(p[1], h, g);
-    M(p[2], g, f);
-    M(p[3], e, h);
-}
-
-sv cswap(gf p[4], gf q[4], u8 b)
-{
-    FOR(i, 0, 4)
-        sel_25519(p[i],q[i],b);
-}
-
-sv pack(u8 *r, gf p[4])
-{
-    gf tx, ty, zi;
-    inv_25519(zi, p[2]);
-    M(tx, p[0], zi);
-    M(ty, p[1], zi);
-    pack_25519(r, ty);
-    r[31] ^= par_25519(tx) << 7;
-}
-
-sv scalarmult(gf p[4], gf q[4], const u8 *s)
-{
-    set_25519(p[0], gf0);
-    set_25519(p[1], gf1);
-    set_25519(p[2], gf1);
-    set_25519(p[3], gf0);
-    for (int i = 255; i >= 0; i--) {
-        u8 b = (s[i/8] >> (i & 7)) & 1;
-        cswap(p, q, b);
-        add(q, p);
-        add(p, p);
-        cswap(p, q, b);
-    }
-}
-
-sv scalarbase(gf p[4], const u8 *s)
-{
-    gf q[4];
-    set_25519(q[0], X);
-    set_25519(q[1], Y);
-    set_25519(q[2], gf1);
-    M(q[3], X, Y);
-    scalarmult(p, q, s);
-}
-
-sv modL(u8 *r, i64 x[64])
-{
-    i64 i, j;
-    for (i = 63;i >= 32;--i) {
-        i64 carry = 0;
-        for (j = i - 32;j < i - 12;++j) {
-            x[j] += carry - 16 * x[i] * L[j - (i - 32)];
-            carry = (x[j] + 128) >> 8;
-            x[j] -= carry << 8;
-        }
-        x[j] += carry;
-        x[i] = 0;
-    }
-    i64 carry = 0;
-    FOR(j, 0, 32) {
-        x[j] += carry - (x[31] >> 4) * L[j];
-        carry = x[j] >> 8;
-        x[j] &= 255;
-    }
-    FOR(j, 0, 32) x[j] -= carry * L[j];
-    FOR(i, 0, 32) {
-        x[i+1] += x[i] >> 8;
-        r[i  ]  = x[i] & 255;
-    }
-}
-
-sv reduce(u8 r[64])
-{
-    i64 x[64];
-    FOR(i, 0, 64) x[i] = (u64) r[i];
-    FOR(i, 0, 64) r[i] = 0;
-    modL(r, x);
-}
-
-static int unpackneg(gf r[4],const u8 p[32])
-{
-    gf t, chk, num, den, den2, den4, den6;
-    set_25519(r[2], gf1);
-    unpack_25519(r[1], p);
-    S(num,r [1]);
-    M(den, num, D);
-    Z(num, num, r[2]);
-    A(den, r[2], den);
-
-    S(den2, den);
-    S(den4, den2);
-    M(den6, den4, den2);
-    M(t, den6, num);
-    M(t, t, den);
-
-    pow2523(t, t);
-    M(t, t, num);
-    M(t, t, den);
-    M(t, t, den);
-    M(r[0], t, den);
-
-    S(chk, r[0]);
-    M(chk, chk, den);
-    if (neq_25519(chk, num)) M(r[0], r[0], I);
-
-    S(chk, r[0]);
-    M(chk, chk, den);
-    if (neq_25519(chk, num)) return -1;
-
-    if (par_25519(r[0]) == (p[31]>>7)) Z(r[0],gf0,r[0]);
-
-    M(r[3], r[0], r[1]);
-    return 0;
-}
-
-#ifdef ED25519_BLAKE2B
-    #include "blake2b.h"
-    #define HASH crypto_blake2b
-#else
-    #ifdef ED25519_SHA512
-        #include "sha512.h"
-        #define HASH crypto_sha512
-    #endif
-#endif
-
-#define COMBINE1(x, y) x ## y
-#define COMBINE2(x, y) COMBINE1(x, y)
-#define HASH_CTX    COMBINE2(HASH, _ctx)
-#define HASH_INIT   COMBINE2(HASH, _init)
-#define HASH_UPDATE COMBINE2(HASH, _update)
-#define HASH_FINAL  COMBINE2(HASH, _final)
-
-// hash function interface
-// Typical uses: sha512 for tests vectors, blake2b for production.
-void HASH_INIT  (HASH_CTX *ctx);
-void HASH_UPDATE(HASH_CTX *ctx, const u8 *in, size_t inlen);
-void HASH_FINAL (HASH_CTX *ctx, u8 hash[64]);
-void HASH(u8 hash[64], const u8 *in, size_t inlen);
-
-sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size)
-{
-    HASH_CTX ctx;
-    HASH_INIT  (&ctx);
-    HASH_UPDATE(&ctx, R , 32    );
-    HASH_UPDATE(&ctx, A , 32    );
-    HASH_UPDATE(&ctx, M , M_size);
-    HASH_FINAL (&ctx, k);
-    reduce(k);
-}
-
-void crypto_ed25519_public_key(uint8_t        public_key[32],
-                               const uint8_t  secret_key[32])
-{
-    // hash the private key, turn the hash into a scalar
-    u8 a[64];
-    HASH(a, secret_key, 32);
-    a[ 0] &= 248;
-    a[31] &= 127;
-    a[31] |= 64;
-
-    // the public key is the packed form of the point aB (B == basepoint)
-    gf aB[4];
-    scalarbase(aB, a);
-    pack(public_key, aB);
-}
-
-void crypto_ed25519_sign(uint8_t        signature[64],
-                         const uint8_t  secret_key[32],
-                         const uint8_t *message,
-                         size_t         message_size)
-{
-    u8 h[64];
-    u8 *a      = h;       // secret scalar
-    u8 *prefix = h + 32;  // prefix for nonce generation
-    HASH(h, secret_key, 32);
-
-    // build public key from secret key
-    a[ 0] &= 248;
-    a[31] &= 127;
-    a[31] |= 64;
-    gf aB[4];
-    scalarbase(aB, a);
-    u8 public_key[32];
-    pack(public_key, aB);
-
-    // Constructs the "random" nonce from the secret key and message.
-    // An actual random number would work just fine, and would save us
-    // the trouble of hashing the message twice.  If we did that
-    // however, the user could fuck it up and reuse the nonce.
-    u8 r[64];
-    HASH_CTX ctx;
-    HASH_INIT  (&ctx);
-    HASH_UPDATE(&ctx, prefix , 32          );
-    HASH_UPDATE(&ctx, message, message_size);
-    HASH_FINAL (&ctx, r);
-
-    gf rB[4];
-    reduce(r);
-    scalarbase(rB, r);
-    pack(signature, rB); // first half of the signature = "random" nonce
-
-    u8 k[64];
-    hash_k(k, signature, public_key, message, message_size);
-
-    i64 s[64]; // s = r + k a
-    FOR(i,  0, 32) s[i] = (u64) r[i];
-    FOR(i, 32, 64) s[i] = 0;
-    FOR(i, 0, 32) {
-        FOR(j, 0, 32) {
-            s[i+j] += k[i] * (u64) a[j];
-        }
-    }
-    modL(signature + 32, s);  // second half of the signature = s
-}
-
-int crypto_ed25519_check(const uint8_t  signature[64],
-                         const uint8_t  public_key[32],
-                         const uint8_t *message,
-                         size_t         message_size)
-{
-    gf aB[4];  if (unpackneg(aB, public_key)) return -1;   // -aB
-    u8 k[64];  hash_k(k, signature, public_key, message, message_size);
-    gf p[4];   scalarmult(p, aB, k);                       // p = -aB k
-    gf sB[4];  scalarbase(sB, signature + 32); add(p, sB); // p = s - aB k
-    u8 t[32];  pack(t, p);
-    return vn(signature, t, 32); // R == s - aB k ? OK : fail
-}
diff --git a/ed25519.h b/ed25519.h
deleted file mode 100644
index f03764b..0000000
--- a/ed25519.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef ED25519_H
-#define ED25519_H
-
-#include <stddef.h>
-#include <inttypes.h>
-
-void crypto_ed25519_public_key(uint8_t        public_key[32],
-                               const uint8_t  secret_key[32]);
-
-void crypto_ed25519_sign(uint8_t        signature[64],
-                         const uint8_t  secret_key[32],
-                         const uint8_t *message,
-                         size_t         message_size);
-
-int crypto_ed25519_check(const uint8_t  signature[64],
-                         const uint8_t  public_key[32],
-                         const uint8_t *message,
-                         size_t         message_size);
-
-#endif // ED25519_H
diff --git a/lock.c b/lock.c
deleted file mode 100644
index bca01da..0000000
--- a/lock.c
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "lock.h"
-#include "x25519.h"
-#include "chacha20.h"
-#include "ae.h"
-
-void crypto_lock_key(uint8_t       shared_key[32],
-                     const uint8_t your_secret_key [32],
-                     const uint8_t their_public_key[32])
-{
-    static const uint8_t _0[16];
-    uint8_t shared_secret[32];
-    crypto_x25519(shared_secret, your_secret_key, their_public_key);
-    crypto_chacha20_H(shared_key, shared_secret, _0);
-}
-
-void crypto_lock_detached(uint8_t        mac[16],
-                          uint8_t       *ciphertext,
-                          const uint8_t  your_secret_key [32],
-                          const uint8_t  their_public_key[32],
-                          const uint8_t  nonce[24],
-                          const uint8_t *plaintext,
-                          size_t         text_size)
-{
-    uint8_t shared_key[32];
-    crypto_lock_key(shared_key, your_secret_key, their_public_key);
-    crypto_ae_lock_detached(mac, ciphertext,
-                            shared_key, nonce,
-                            plaintext, text_size);
-}
-
-int crypto_unlock_detached(uint8_t       *plaintext,
-                           const uint8_t  your_secret_key [32],
-                           const uint8_t  their_public_key[32],
-                           const uint8_t  nonce[24],
-                           const uint8_t  mac[16],
-                           const uint8_t *ciphertext,
-                           size_t         text_size)
-{
-    uint8_t shared_key[32];
-    crypto_lock_key(shared_key, your_secret_key, their_public_key);
-    return crypto_ae_unlock_detached(plaintext,
-                                     shared_key, nonce,
-                                     mac, ciphertext, text_size);
-}
-
-void crypto_lock(uint8_t       *box,
-                 const uint8_t  your_secret_key [32],
-                 const uint8_t  their_public_key[32],
-                 const uint8_t  nonce[24],
-                 const uint8_t *plaintext,
-                 size_t         text_size)
-{
-    crypto_lock_detached(box, box + 16,
-                         your_secret_key, their_public_key, nonce,
-                         plaintext, text_size);
-}
-
-int crypto_unlock(uint8_t       *plaintext,
-                  const uint8_t  your_secret_key [32],
-                  const uint8_t  their_public_key[32],
-                  const uint8_t  nonce[24],
-                  const uint8_t *box,
-                  size_t         text_size)
-{
-    return crypto_unlock_detached(plaintext,
-                                  your_secret_key, their_public_key, nonce,
-                                  box, box + 16, text_size);
-}
-
-static const uint8_t null_nonce[24] = {};
-
-void crypto_anonymous_lock(uint8_t       *box,
-                           const uint8_t  random_secret_key[32],
-                           const uint8_t  their_public_key[32],
-                           const uint8_t *plaintext,
-                           size_t         text_size)
-{
-    crypto_x25519_base(box, random_secret_key); // put public key in box
-    crypto_lock(box + 32,
-                random_secret_key, their_public_key, null_nonce,
-                plaintext, text_size);
-}
-
-int crypto_anonymous_unlock(uint8_t       *plaintext,
-                            const uint8_t  your_secret_key[32],
-                            const uint8_t *box,
-                            size_t         text_size)
-{
-    return crypto_unlock(plaintext,
-                         your_secret_key, box, null_nonce,
-                         box + 32, text_size);
-}
diff --git a/lock.h b/lock.h
deleted file mode 100644
index c1b3e87..0000000
--- a/lock.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef LOCK_H
-#define LOCK_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-// Computes a shared key with your secret key and their public key,
-// suitable for crypto_ae* functions.
-void crypto_lock_key(uint8_t       shared_key      [32],
-                     const uint8_t your_secret_key [32],
-                     const uint8_t their_public_key[32]);
-
-// Authenticated encryption with the sender's secret key and the recipient's
-// public key.  The message leaks if one of the secret key gets compromised.
-void crypto_lock_detached(uint8_t        mac[16],
-                          uint8_t       *ciphertext,
-                          const uint8_t  your_secret_key [32],
-                          const uint8_t  their_public_key[32],
-                          const uint8_t  nonce[24],
-                          const uint8_t *plaintext,
-                          size_t         text_size);
-
-// Authenticated decryption with the recipient's secret key, and the sender's
-// public key.  Has no effect if the message is forged.
-int crypto_unlock_detached(uint8_t       *plaintext,
-                           const uint8_t  your_secret_key [32],
-                           const uint8_t  their_public_key[32],
-                           const uint8_t  nonce[24],
-                           const uint8_t  mac[16],
-                           const uint8_t *ciphertext,
-                           size_t         text_size);
-
-// Like the above, only puts the mac and the ciphertext together
-// in a "box", mac first
-void crypto_lock(uint8_t       *box,
-                 const uint8_t  your_secret_key [32],
-                 const uint8_t  their_public_key[32],
-                 const uint8_t  nonce[24],
-                 const uint8_t *plaintext,
-                 size_t         text_size);
-
-// Unlocks a box locked by crypto_lock()
-int crypto_unlock(uint8_t       *plaintext,
-                  const uint8_t  your_secret_key [32],
-                  const uint8_t  their_public_key[32],
-                  const uint8_t  nonce[24],
-                  const uint8_t *box,
-                  size_t         text_size);
-
-void crypto_anonymous_lock(uint8_t       *box,
-                           const uint8_t  random_secret_key[32],
-                           const uint8_t  their_public_key[32],
-                           const uint8_t *plaintext,
-                           size_t         text_size);
-
-int crypto_anonymous_unlock(uint8_t       *plaintext,
-                            const uint8_t  your_secret_key[32],
-                            const uint8_t *box,
-                            size_t         text_size);
-
-#endif // LOCK_H
diff --git a/monocypher.c b/monocypher.c
new file mode 100644
index 0000000..db48387
--- /dev/null
+++ b/monocypher.c
@@ -0,0 +1,1427 @@
+#include "monocypher.h"
+
+static uint32_t load32_le(const uint8_t s[4])
+{
+    return s[0]
+        | (s[1] <<  8)
+        | (s[2] << 16)
+        | (s[3] << 24);
+}
+
+static void store32_le(uint8_t output[4], uint32_t input)
+{
+    output[0] =  input        & 0xff;
+    output[1] = (input >>  8) & 0xff;
+    output[2] = (input >> 16) & 0xff;
+    output[3] = (input >> 24) & 0xff;
+}
+
+static uint64_t rotr64(uint64_t x, uint64_t y)
+{
+    return (x >> y) ^ (x << (64 - y));
+}
+
+static uint64_t load64_le(const uint8_t s[8])
+{
+    return
+        ((uint64_t)s[0]      ) ^
+        ((uint64_t)s[1] <<  8) ^
+        ((uint64_t)s[2] << 16) ^
+        ((uint64_t)s[3] << 24) ^
+        ((uint64_t)s[4] << 32) ^
+        ((uint64_t)s[5] << 40) ^
+        ((uint64_t)s[6] << 48) ^
+        ((uint64_t)s[7] << 56);
+}
+
+static void store64_le(uint8_t output[8], uint64_t input)
+{
+    output[0] =  input        & 0xff;
+    output[1] = (input >>  8) & 0xff;
+    output[2] = (input >> 16) & 0xff;
+    output[3] = (input >> 24) & 0xff;
+    output[4] = (input >> 32) & 0xff;
+    output[5] = (input >> 40) & 0xff;
+    output[6] = (input >> 48) & 0xff;
+    output[7] = (input >> 56) & 0xff;
+}
+
+static void
+chacha20_rounds(uint32_t out[16], const uint32_t in[16])
+{
+    for (int i = 0; i < 16; i++)
+        out[i] = in[i];
+
+    for (int i = 0; i < 10; i++) { // 20 rounds, 2 rounds per loop.
+#define ROT_L32(x, n) x = (x << n) | (x >> (32 - n))
+#define QUARTERROUND(a, b, c, d)           \
+        a += b;  d ^= a;  ROT_L32(d, 16);  \
+        c += d;  b ^= c;  ROT_L32(b, 12);  \
+        a += b;  d ^= a;  ROT_L32(d,  8);  \
+        c += d;  b ^= c;  ROT_L32(b,  7)
+
+        QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0
+        QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1
+        QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2
+        QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3
+        QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1
+        QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2
+        QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3
+        QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4
+    }
+}
+
+static void
+chacha20_init_key(crypto_chacha_ctx *ctx, const uint8_t key[32])
+{
+    // constant
+    ctx->input[0]   = load32_le((uint8_t*)"expa");
+    ctx->input[1]   = load32_le((uint8_t*)"nd 3");
+    ctx->input[2]   = load32_le((uint8_t*)"2-by");
+    ctx->input[3]   = load32_le((uint8_t*)"te k");
+    // key
+    for (int i = 0; i < 8; i++)
+        ctx->input[i + 4] = load32_le(key + i*4);
+    // pool index (the random pool starts empty)
+    ctx->pool_index = 64;
+}
+
+void
+crypto_chacha20_H(uint8_t       out[32],
+                  const uint8_t key[32],
+                  const uint8_t in [16])
+{
+    crypto_chacha_ctx ctx;
+    chacha20_init_key(&ctx, key);
+    for (int i = 0; i < 4; i++)
+        ctx.input[i + 12] = load32_le(in + i*4);
+
+    uint32_t buffer[16];
+    chacha20_rounds(buffer, ctx.input);
+    // prevents reversal of the rounds by revealing only half of the buffer.
+    for (int i = 0; i < 4; i++) {
+        store32_le(out      + i*4, buffer[i     ]); // constant
+        store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce
+    }
+}
+
+void
+crypto_chacha20_init(crypto_chacha_ctx *ctx,
+                     const uint8_t      key[32],
+                     const uint8_t      nonce[8])
+{
+    chacha20_init_key(ctx, key  );         // key
+    ctx->input[12] = 0;                    // counter
+    ctx->input[13] = 0;                    // counter
+    ctx->input[14] = load32_le(nonce + 0); // nonce
+    ctx->input[15] = load32_le(nonce + 4); // nonce
+}
+
+void
+crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
+                      const uint8_t      key[32],
+                      const uint8_t      nonce[24])
+{
+    uint8_t derived_key[32];
+    crypto_chacha20_H(derived_key, key, nonce);
+    crypto_chacha20_init(ctx, derived_key, nonce + 16);
+}
+
+void
+crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
+                        const uint8_t     *plain_text,
+                        uint8_t           *cipher_text,
+                        size_t             message_size)
+{
+    for (size_t i = 0; i < message_size; i++) {
+        // refill the pool if empty
+        if (ctx->pool_index == 64) {
+            // fill the pool
+            uint32_t buffer[16];
+            chacha20_rounds(buffer, ctx->input);
+            for (int i = 0; i < 16; i++)
+                store32_le(ctx->random_pool + i*4, buffer[i] + ctx->input[i]);
+            // update the counters
+            ctx->pool_index = 0;
+            ctx->input[12]++;
+            if (!ctx->input[12])
+                ctx->input[13]++;
+        }
+        // use the pool for encryption (or random stream)
+        cipher_text[i] =
+            (plain_text == 0 ? 0 : plain_text[i])
+            ^ ctx->random_pool[ctx->pool_index];
+        ctx->pool_index++;
+    }
+}
+
+void
+crypto_chacha20_random(crypto_chacha_ctx *ctx,
+                       uint8_t           *cipher_text,
+                       size_t             message_size)
+{
+    crypto_chacha20_encrypt(ctx, 0, cipher_text, message_size);
+}
+
+
+
+static void poly_load(uint32_t out[4], const uint8_t in[16])
+{
+    for (int i = 0; i < 4; i++)
+        out[i] = load32_le(in + i*4);
+}
+
+static void poly_add(uint32_t out[5], const uint32_t a[5], const uint32_t b[5])
+{
+    uint64_t carry = 0;
+    for (int i = 0; i < 5; i++) {
+        carry  += (int64_t)(a[i]) + b[i];
+        out[i]  = carry & 0xffffffff; // lower 32 bits right there.
+        carry >>= 32;                 // retain the carry
+    }
+}
+
+// h = (h + c) * r
+static void poly_block(crypto_poly1305_ctx *ctx)
+{
+    // h + c, without carry propagation
+    const uint64_t h0 = ctx->h[0] + (uint64_t)ctx->c[0];
+    const uint64_t h1 = ctx->h[1] + (uint64_t)ctx->c[1];
+    const uint64_t h2 = ctx->h[2] + (uint64_t)ctx->c[2];
+    const uint64_t h3 = ctx->h[3] + (uint64_t)ctx->c[3];
+    const uint64_t h4 = ctx->h[4] + (uint64_t)ctx->c[4];
+
+    // Local all the things!
+    const uint64_t r0 = ctx->r[0];
+    const uint64_t r1 = ctx->r[1];
+    const uint64_t r2 = ctx->r[2];
+    const uint64_t r3 = ctx->r[3];
+    const uint64_t rr0 = (ctx->r[0] >> 2) * 5; // lose 2 bottom bits...
+    const uint64_t rr1 = (ctx->r[1] >> 2) * 5; // 2 bottom bits already cleared
+    const uint64_t rr2 = (ctx->r[2] >> 2) * 5; // 2 bottom bits already cleared
+    const uint64_t rr3 = (ctx->r[3] >> 2) * 5; // 2 bottom bits already cleared
+
+    // (h + c) * r, without carry propagation
+    const uint64_t x0 = h0*r0 + h1*rr3 + h2*rr2 + h3*rr1 + h4*rr0;
+    const uint64_t x1 = h0*r1 + h1*r0  + h2*rr3 + h3*rr2 + h4*rr1;
+    const uint64_t x2 = h0*r2 + h1*r1  + h2*r0  + h3*rr3 + h4*rr2;
+    const uint64_t x3 = h0*r3 + h1*r2  + h2*r1  + h3*r0  + h4*rr3;
+    const uint64_t x4 = h4 * (r0 & 3); // ...recover those 2 bits
+
+    // carry propagation, put ctx->h under 2^130
+    const uint64_t msb = x4 + (x3 >> 32);
+    uint64_t       u   = (msb >> 2) * 5; // lose 2 bottom bits...
+    u += (x0 & 0xffffffff)             ;  ctx->h[0] = u & 0xffffffff;  u >>= 32;
+    u += (x1 & 0xffffffff) + (x0 >> 32);  ctx->h[1] = u & 0xffffffff;  u >>= 32;
+    u += (x2 & 0xffffffff) + (x1 >> 32);  ctx->h[2] = u & 0xffffffff;  u >>= 32;
+    u += (x3 & 0xffffffff) + (x2 >> 32);  ctx->h[3] = u & 0xffffffff;  u >>= 32;
+    u += msb & 3 /* ...recover them */ ;  ctx->h[4] = u;
+}
+
+// (re-)initializes the input counter and input buffer
+static void poly_clear_c(crypto_poly1305_ctx *ctx)
+{
+    for (int i = 0; i < 4; i++)
+        ctx->c[i] = 0;
+    ctx->c_index = 0;
+}
+
+void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32])
+{
+    // initial h: zero
+    for (int i =  0; i < 5; i++)
+        ctx->h [i] = 0;
+    // initial r: first half of the key, minus a few bits
+    poly_load(ctx->r, key);
+    ctx->r[0] &= 0x0fffffff; // clear top 4 bits
+    ctx->r[1] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
+    ctx->r[2] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
+    ctx->r[3] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
+    ctx->c[4]  = 1;
+    // second half of the key, saved for later
+    poly_load(ctx->pad, key + 16);
+    ctx->pad[4] = 0;
+    // buffer and counter
+    poly_clear_c(ctx);
+}
+
+void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
+                            const uint8_t *m, size_t bytes)
+{
+    while (bytes > 0) {
+        if (ctx->c_index == 16) {
+            poly_block(ctx);
+            poly_clear_c(ctx);
+        }
+        // feed the input buffer
+        ctx->c[ctx->c_index / 4] |= *m << ((ctx->c_index % 4) * 8);
+        ctx->c_index++;
+        m++;
+        bytes--;
+    }
+}
+
+void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16])
+{
+    // move the final 1 according to remaining input length
+    ctx->c[4] = 0;
+    ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8);
+    // one last hash update...
+    poly_block(ctx);
+    // ... this time with full modular reduction
+    // We only need to conditionally subtract 2^130-5,
+    // using bit twidling to prevent timing attacks.
+    static const uint32_t minus_p[5] = { 5, 0, 0, 0, 0xfffffffc };
+    uint32_t h_minus_p[5];
+    poly_add(h_minus_p, ctx->h, minus_p);
+    uint32_t negative = ~(-(h_minus_p[4] >> 31)); // 0 or -1 (2's complement)
+    for (int i = 0; i < 5; i++) {
+        ctx->h[i] ^= negative & (ctx->h[i] ^ h_minus_p[i]);
+    }
+    // Add the secret pad to the final hash before output
+    poly_add(ctx->h, ctx->h, ctx->pad);
+    for (int i = 0; i < 4; i++)
+        store32_le(mac + i*4, ctx->h[i]);
+}
+
+void crypto_poly1305_auth(uint8_t mac[16], const uint8_t *m,
+                          size_t  m_size , const uint8_t  key[32])
+{
+    crypto_poly1305_ctx ctx;
+    crypto_poly1305_init  (&ctx, key);
+    crypto_poly1305_update(&ctx, m, m_size);
+    crypto_poly1305_finish(&ctx, mac);
+}
+
+int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16])
+{
+    unsigned diff = 0;
+    for (int i = 0; i < 16; i++) {
+        diff |= (mac1[i] ^ mac2[i]);
+    }
+    return diff;
+}
+// ripped off from the reference implentation in RFC 7693
+
+
+
+
+// Initialization Vector.
+static const uint64_t blake2b_iv[8] = {
+    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+};
+
+// increment a 128-bit "word".
+static void
+incr(uint64_t x[2], uint64_t y)
+{
+    x[0] += y;                 // increment the low word
+    if (x[0] < y) { x[1]++; }  // handle overflow
+}
+
+static void
+blake2b_compress(crypto_blake2b_ctx *ctx, _Bool last_block)
+{
+    static const uint8_t sigma[12][16] = {
+        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+    };
+
+    // init work variables (before shuffling them)
+    uint64_t v[16];
+    for (int i = 0; i < 8; i++) {
+        v[i    ] = ctx->hash[i];
+        v[i + 8] = blake2b_iv[i];
+    }
+    v[12] ^= ctx->input_size[0]; // low 64 bits of offset
+    v[13] ^= ctx->input_size[1]; // high 64 bits
+    if (last_block) { v[14] = ~v[14]; }
+
+    // load the input buffer
+    uint64_t m[16];
+    for (int i = 0; i < 16; i++) {
+        m[i] = load64_le(&ctx->buf[i * 8]);
+    }
+
+    // shuffle the work variables with the 12 rounds
+    for (int i = 0; i < 12; i++) {
+#define B2B_G(a, b, c, d, x, y)                                    \
+        v[a] += v[b] + x;  v[d] ^= v[a];  v[d] = rotr64(v[d], 32); \
+        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 24); \
+        v[a] += v[b] + y;  v[d] ^= v[a];  v[d] = rotr64(v[d], 16); \
+        v[c] += v[d]    ;  v[b] ^= v[c];  v[b] = rotr64(v[b], 63)
+
+        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
+        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
+        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
+        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
+        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
+        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+    }
+
+    // accumulate the work variables into the hash
+    for(int i = 0; i < 8; i++) {
+        ctx->hash[i] ^= v[i] ^ v[i + 8];
+    }
+}
+
+void
+crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen,
+                            const uint8_t      *key, size_t keylen)
+{
+    // Initial hash == initialization vector...
+    for (int i = 0; i < 8; i++) {
+        ctx->hash[i] = blake2b_iv[i];
+    }
+    ctx->hash[0]      ^= 0x01010000 ^ (keylen << 8) ^ outlen;  // ...mostly
+    ctx->input_size[0] = 0;       // input count low word
+    ctx->input_size[1] = 0;       // input count high word
+    ctx->c             = 0;       // pointer within buffer
+    ctx->output_size   = outlen;  // size of the final hash
+
+    // If there's a key, put it in the first block, then pad with zeroes
+    if (keylen > 0) {
+        for (size_t i = 0     ; i < keylen; i++) { ctx->buf[i] = key[i]; }
+        for (size_t i = keylen; i < 128   ; i++) { ctx->buf[i] = 0;      }
+        ctx->c = 128; // mark the block as used
+    }
+}
+
+void
+crypto_blake2b_init(crypto_blake2b_ctx *ctx)
+{
+    crypto_blake2b_general_init(ctx, 64, 0, 0);
+}
+
+void
+crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen)
+{
+    for (size_t i = 0; i < inlen; i++) {
+        // If the buffer is full, increment the counters and
+        // add (compress) the current buffer to the hash
+        if (ctx->c == 128) {
+            ctx->c = 0;
+            incr(ctx->input_size, 128);
+            blake2b_compress(ctx, 0); // not last time -> 0
+        }
+        // By now the buffer is not full.  We add one input byte.
+        ctx->buf[ctx->c] = in[i];
+        ctx->c++;
+    }
+}
+
+void
+crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out)
+{
+    // update input size, pad then compress the buffer
+    incr(ctx->input_size, ctx->c);
+    for (int i = ctx->c; i < 128; i++) { ctx->buf[i] = 0; }
+    blake2b_compress(ctx, 1); // last time -> 1
+
+    // copy the hash in the output (little endian of course)
+    for (int i = 0; i < ctx->output_size; i++) {
+        out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xFF;
+    }
+}
+
+void
+crypto_blake2b_general(      uint8_t*out, size_t outlen,
+                       const uint8_t*key, size_t keylen,
+                       const uint8_t*in,  size_t inlen)
+{
+    crypto_blake2b_ctx ctx;
+    crypto_blake2b_general_init(&ctx, outlen, key, keylen);
+    crypto_blake2b_update(&ctx, in, inlen);
+    crypto_blake2b_final(&ctx, out);
+}
+
+void
+crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen)
+{
+    crypto_blake2b_general(out, 64, 0, 0, in, inlen);
+}
+
+
+/////////////////
+/// Utilities ///
+/////////////////
+
+
+static uint32_t
+min(uint32_t a, uint32_t b)
+{
+    return a <= b ? a : b;
+}
+
+// updates a blake2 hash with a 32 bit word, little endian.
+static void
+blake_update_32(crypto_blake2b_ctx *ctx, uint32_t input)
+{
+    uint8_t buf[4];
+    store32_le(buf, input);
+    crypto_blake2b_update(ctx, buf, 4);
+}
+
+//////////////////
+// Argon2 block //
+//////////////////
+typedef struct block {
+    uint64_t a[128]; // 1024 octets in 128 64-bit words
+} block;
+
+static void
+load_block(block *b, const uint8_t bytes[1024])
+{
+    for (int i = 0; i < 128; i++) {
+        b->a[i] = load64_le(bytes + i * 8);
+    }
+}
+
+static void
+store_block(uint8_t bytes[1024], const block *b)
+{
+    for (int i = 0; i < 128; i++) {
+        store64_le(bytes + i * 8, b->a[i]);
+    }
+}
+
+static void
+copy_block(block *out, const block *in)
+{
+    for (int i = 0; i < 128; i++) {
+        out->a[i] = in->a[i];
+    }
+}
+
+static void
+xor_block(block *out, const block *in)
+{
+    for (int i = 0; i < 128; i++) {
+        out->a[i] ^= in->a[i];
+    }
+}
+
+////////////////////
+// Argon2i proper //
+////////////////////
+
+// Hash with a virtually unlimited digest size.
+// Doesn't extract more entropy than the base hash function.
+// Mainly used for filling a whole kilobyte block with pseudo-random bytes.
+static void
+extended_hash(uint8_t       *digest, uint32_t digest_size,
+              const uint8_t *input , uint32_t input_size)
+{
+    crypto_blake2b_ctx ctx;
+    crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0);
+    blake_update_32            (&ctx, digest_size);
+    crypto_blake2b_update      (&ctx, input, input_size);
+    crypto_blake2b_final       (&ctx, digest);
+
+    if (digest_size > 64) {
+        // the conversion to u64 avoids integer overflow on
+        // ludicrously big hash sizes.
+        uint32_t r   = (((uint64_t)digest_size + 31) / 32) - 2;
+        uint32_t i   =  1;
+        uint32_t in  =  0;
+        uint32_t out = 32;
+        while (i < r) {
+            // Input and output overlap.
+            // This shouldn't be a problem.
+            crypto_blake2b(digest + out, digest + in, 64);
+            i   +=  1;
+            in  += 32;
+            out += 32;
+        }
+        crypto_blake2b_general(digest + out, digest_size - (32 * r),
+                               0, 0, // no key
+                               digest + in , 64);
+    }
+}
+
+// Core of the compression function G.  Computes Z from R in place.
+static void
+g_rounds(block *work_block)
+{
+#define LSB(x) ((x) & 0xffffffff)
+#define G(a, b, c, d)                                            \
+    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
+    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
+    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
+    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
+#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
+              v8,  v9, v10, v11, v12, v13, v14, v15)    \
+    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
+    G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
+    G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
+    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
+
+    // column rounds (work_block = Q)
+    for (int i = 0; i < 128; i += 16) {
+        ROUND(work_block->a[i     ], work_block->a[i +  1],
+              work_block->a[i +  2], work_block->a[i +  3],
+              work_block->a[i +  4], work_block->a[i +  5],
+              work_block->a[i +  6], work_block->a[i +  7],
+              work_block->a[i +  8], work_block->a[i +  9],
+              work_block->a[i + 10], work_block->a[i + 11],
+              work_block->a[i + 12], work_block->a[i + 13],
+              work_block->a[i + 14], work_block->a[i + 15]);
+    }
+    // row rounds (work_block = Z)
+    for (int i = 0; i < 16; i += 2) {
+        ROUND(work_block->a[i      ], work_block->a[i +   1],
+              work_block->a[i +  16], work_block->a[i +  17],
+              work_block->a[i +  32], work_block->a[i +  33],
+              work_block->a[i +  48], work_block->a[i +  49],
+              work_block->a[i +  64], work_block->a[i +  65],
+              work_block->a[i +  80], work_block->a[i +  81],
+              work_block->a[i +  96], work_block->a[i +  97],
+              work_block->a[i + 112], work_block->a[i + 113]);
+    }
+}
+
+// The compression function G
+// may overwrite result completely  (xcopy == copy_block),
+// or XOR result with the old block (xcopy ==  xor_block)
+static void
+binary_g(block *result, const block *x, const block *y,
+         void (*xcopy) (block*, const block*))
+{
+    // put R = X ^ Y into tmp
+    block tmp;
+    copy_block(&tmp, x);
+    xor_block (&tmp, y);
+
+    xcopy(result, &tmp);     // save R (erase or xor the old block)
+    g_rounds(&tmp);          // tmp = Z
+    xor_block(result, &tmp); // result =  R ^ Z (or R ^ Z ^ old)
+}
+
+// unary version of the compression function.
+// The missing argument is implied zero.
+// Does the transformation in place.
+static void
+unary_g(block *work_block)
+{
+    // work_block == R
+    block tmp;
+    copy_block(&tmp, work_block); // tmp = R
+    g_rounds(work_block);         // work_block = Z
+    xor_block(work_block, &tmp);  // work_block = Z ^ R
+}
+
+typedef struct gidx_ctx {
+    block    b;
+    uint32_t pass_number;
+    uint32_t slice_number;
+    uint32_t nb_blocks;
+    uint32_t nb_iterations;
+    uint32_t ctr;
+    uint32_t index;
+} gidx_ctx;
+
+static void
+gidx_refresh(gidx_ctx *ctx)
+{
+    ctx->b.a[0] = ctx->pass_number;
+    ctx->b.a[1] = 0;  // lane number (we have only one)
+    ctx->b.a[2] = ctx->slice_number;
+    ctx->b.a[3] = ctx->nb_blocks;
+    ctx->b.a[4] = ctx->nb_iterations;
+    ctx->b.a[5] = 1;  // type: Argon2i
+    ctx->b.a[6] = ctx->ctr;
+    // zero the rest of the block
+    for (int i = 7; i < 128; i++) {
+        ctx->b.a[i] = 0;
+    }
+
+    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
+    // Applies the G "square" function to get cheap pseudo-random numbers.
+    unary_g(&(ctx->b));
+    unary_g(&(ctx->b)); // square means apply it twice
+}
+
+static void
+gidx_init(gidx_ctx *ctx,
+          uint32_t pass_number,
+          uint32_t slice_number,
+          uint32_t nb_blocks,
+          uint32_t nb_iterations)
+{
+    ctx->pass_number   = pass_number;
+    ctx->slice_number  = slice_number;
+    ctx->nb_blocks     = nb_blocks;
+    ctx->nb_iterations = nb_iterations;
+    ctx->ctr           = 1;   // not zero, surprisingly
+    ctx->index         = pass_number == 0 && slice_number == 0 ? 2 : 0;
+    gidx_refresh(ctx);
+}
+
+static uint32_t
+gidx_next(gidx_ctx *ctx)
+{
+    // lazily creates the index block we need
+    if (ctx->index == 128) {
+        ctx->index = 0;
+        ctx->ctr++;
+        gidx_refresh(ctx);
+    }
+    // saves and increment the index
+    uint32_t index = ctx->index;
+    ctx->index++; // updates index for the next call
+
+    // Computes the area size.
+    // Pass 0 : all already finished segments plus already constructed
+    //          blocks in this segment
+    // Pass 1+: 3 last segments plus already constructed
+    //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
+    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
+    _Bool    first_pass = ctx->pass_number == 0;
+    uint32_t slice_size = ctx->nb_blocks / 4;
+    uint32_t area_size  = ((first_pass ? ctx->slice_number : 3)
+                           * slice_size + index - 1);
+
+    // Computes the starting position of the reference area.
+    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
+    // NEXT SEGMENT, NOT THE NEXT BLOCK.
+    uint32_t next_slice = (ctx->slice_number == 3
+                           ? 0
+                           : (ctx->slice_number + 1) * slice_size);
+    uint32_t start_pos  = first_pass ? 0 : next_slice;
+
+    // Generates the actual index from J1 (no need for J2, there's only one lane)
+    uint64_t j1         = ctx->b.a[index] & 0xffffffff; // pseudo-random number
+    uint64_t x          = (j1 * j1)       >> 32;
+    uint64_t y          = (area_size * x) >> 32;
+    uint64_t z          = area_size - 1 - y;
+    return (start_pos + z) % ctx->nb_blocks;
+}
+
+// Main algorithm
+void
+crypto_argon2i_hash(uint8_t       *tag,       uint32_t tag_size,
+                    const uint8_t *password,  uint32_t password_size,
+                    const uint8_t *salt,      uint32_t salt_size,
+                    const uint8_t *key,       uint32_t key_size,
+                    const uint8_t *ad,        uint32_t ad_size,
+                    void *work_area,
+                    uint32_t nb_blocks,
+                    uint32_t nb_iterations)
+{
+    // work area seen as blocks (must be suitably aligned)
+    block *blocks = work_area;
+    {
+        crypto_blake2b_ctx ctx;
+        crypto_blake2b_init(&ctx);
+
+        blake_update_32      (&ctx, 1            ); // p: number of threads
+        blake_update_32      (&ctx, tag_size     );
+        blake_update_32      (&ctx, nb_blocks    );
+        blake_update_32      (&ctx, nb_iterations);
+        blake_update_32      (&ctx, 0x13         ); // v: version number
+        blake_update_32      (&ctx, 1            ); // y: Argon2i
+        blake_update_32      (&ctx,           password_size);
+        crypto_blake2b_update(&ctx, password, password_size);
+        blake_update_32      (&ctx,           salt_size);
+        crypto_blake2b_update(&ctx, salt,     salt_size);
+        blake_update_32      (&ctx,           key_size);
+        crypto_blake2b_update(&ctx, key,      key_size);
+        blake_update_32      (&ctx,           ad_size);
+        crypto_blake2b_update(&ctx, ad,       ad_size);
+
+        uint8_t initial_hash[72]; // 64 bytes plus 2 words for future hashes
+        crypto_blake2b_final(&ctx, initial_hash);
+
+        // fill first 2 blocks
+        block   tmp_block;
+        uint8_t hash_area[1024];
+        store32_le(initial_hash + 64, 0); // first  additional word
+        store32_le(initial_hash + 68, 0); // second additional word
+        extended_hash(hash_area, 1024, initial_hash, 72);
+        load_block(&tmp_block, hash_area);
+        copy_block(blocks, &tmp_block);
+
+        store32_le(initial_hash + 64, 1); // slight modification
+        extended_hash(hash_area, 1024, initial_hash, 72);
+        load_block(&tmp_block, hash_area);
+        copy_block(blocks + 1, &tmp_block);
+    }
+
+    // Actual number of blocks
+    nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread)
+    const uint32_t segment_size = nb_blocks / 4;
+
+    // fill (then re-fill) the rest of the blocks
+    for (uint32_t pass_number = 0; pass_number < nb_iterations; pass_number++) {
+        _Bool     first_pass  = pass_number == 0;
+        // Simple copy on pass 0, XOR instead of overwrite on subsequent passes
+        void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block;
+
+        for (int segment = 0; segment < 4; segment++ ) {
+
+            gidx_ctx ctx;
+            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
+
+            // On the first segment of the first pass,
+            // blocks 0 and 1 are already filled.
+            // We use the offset to skip them.
+            uint32_t offset = first_pass && segment == 0 ? 2 : 0;
+            // current, reference, and previous are block indices
+            for (uint32_t current =  segment      * segment_size + offset;
+                 current          < (segment + 1) * segment_size;
+                 current++) {
+                uint32_t previous  = current == 0 ? nb_blocks - 1 : current - 1;
+                uint32_t reference = gidx_next(&ctx);
+                binary_g(blocks + current,
+                         blocks + previous,
+                         blocks + reference,
+                         xcopy);
+            }
+        }
+    }
+    // hash the very last block with H' into the output tag
+    uint8_t final_block[1024];
+    store_block(final_block, blocks + (nb_blocks - 1));
+    extended_hash(tag, tag_size, final_block, 1024);
+}
+
+void
+crypto_argon2i(uint8_t        tag[32],
+               const uint8_t *password,  uint32_t password_size,
+               const uint8_t *salt,      uint32_t salt_size,
+               void    *work_area,
+               uint32_t nb_blocks,
+               uint32_t nb_iterations)
+{
+    crypto_argon2i_hash(tag     , 32,
+                        password, password_size,
+                        salt    , salt_size,
+                        0, 0, 0, 0,
+                        work_area, nb_blocks, nb_iterations);
+}
+
+// Taken from TweetNaCl
+
+
+#define FOR(i, start, end) for (size_t i = start; i < end; i++)
+#define sv static void
+typedef int64_t gf[16];
+
+static const uint8_t _0[16];
+static const uint8_t _9[32] = { 9 };
+static const gf _121665 = { 0xdb41, 1 };
+
+sv car_25519(gf o)
+{
+    FOR(i, 0, 16) {
+        o[i]              += 1LL  << 16;
+        int64_t c          = o[i] >> 16;
+        o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15));
+        o[i]              -= c << 16;
+    }
+}
+
+sv sel_25519(gf p, gf q, int b)
+{
+    int64_t c = ~(b-1);
+    FOR(i, 0, 16) {
+        int64_t t = c & (p[i] ^ q[i]);
+        p[i]     ^= t;
+        q[i]     ^= t;
+    }
+}
+
+sv pack_25519(uint8_t *o, const gf n)
+{
+    gf t;
+    FOR(i, 0, 16) t[i] = n[i];
+    car_25519(t);
+    car_25519(t);
+    car_25519(t);
+    FOR(j, 0, 2) {
+        gf m;
+        m[0] = t[0] - 0xffed;
+        FOR(i, 1, 15) {
+            m[i  ]  = t[i] - 0xffff - ((m[i-1] >> 16) & 1);
+            m[i-1] &= 0xffff;
+        }
+        m[15]  = t[15] - 0x7fff - ((m[14] >> 16) & 1);
+        int b  = (m[15] >> 16) & 1;
+        m[14] &= 0xffff;
+        sel_25519(t, m, 1-b);
+    }
+    FOR(i, 0, 16) {
+        o[2*i    ] = t[i] & 0xff;
+        o[2*i + 1] = t[i] >> 8;
+    }
+}
+
+sv unpack_25519(gf o, const uint8_t *n)
+{
+    FOR(i, 0, 16) o[i] = n[2*i] + ((int64_t)n[2*i + 1] << 8);
+    o[15] &= 0x7fff;
+}
+
+sv A(gf o, const gf a, const gf b)
+{
+    FOR(i, 0, 16) o[i] = a[i] + b[i];
+}
+
+sv Z(gf o, const gf a, const gf b)
+{
+    FOR(i, 0, 16) o[i] = a[i] - b[i];
+}
+
+sv M(gf o, const gf a, const gf b)
+{
+    int64_t t[31];
+    FOR(i, 0, 31) t[i] = 0;
+    FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j];
+    FOR(i, 0, 15) t[i] += 38 * t[i+16];
+    FOR(i, 0, 16) o[i] = t[i];
+    car_25519(o);
+    car_25519(o);
+}
+
+sv S(gf o,const gf a)
+{
+    M(o, a, a);
+}
+
+sv inv_25519(gf o,const gf i)
+{
+    gf c;
+    FOR(a, 0, 16) c[a] = i[a];
+    for(int a = 253; a >= 0; a--) {
+        S(c, c);
+        if(a != 2 && a != 4)
+            M(c, c, i);
+    }
+    FOR(a, 0, 16) o[a] = c[a];
+}
+
+void crypto_x25519(uint8_t q[32], const uint8_t n[32], const uint8_t p[32])
+{
+    uint8_t z[32];
+    int64_t x[80];
+    int64_t r;
+    gf a, b, c, d, e, f;
+    FOR(i, 0, 31) z[i] = n[i];
+    z[31]  = (n[31] & 127) | 64;
+    z[0 ] &= 248;
+    unpack_25519(x, p);
+    FOR(i, 0, 16) {
+        b[i] = x[i];
+        d[i] = a[i] = c[i] = 0;
+    }
+    a[0] = d[0] = 1;
+    for(int i = 254; i>=0; i--) {
+        r = (z[i>>3] >> (i & 7)) & 1;
+        sel_25519(a, b, r);
+        sel_25519(c, d, r);
+        A(e, a, c);
+        Z(a, a, c);
+        A(c, b, d);
+        Z(b, b, d);
+        S(d, e);
+        S(f, a);
+        M(a, c, a);
+        M(c, b, e);
+        A(e, a, c);
+        Z(a, a, c);
+        S(b, a);
+        Z(c, d, f);
+        M(a, c, _121665);
+        A(a, a, d);
+        M(c, c, a);
+        M(a, d, f);
+        M(d, b, x);
+        S(b, e);
+        sel_25519(a, b, r);
+        sel_25519(c, d, r);
+    }
+    FOR(i, 0, 16) {
+        x[i+16] = a[i];
+        x[i+32] = c[i];
+        x[i+48] = b[i];
+        x[i+64] = d[i];
+    }
+    inv_25519(x+32, x+32);
+    M(x+16, x+16, x+32);
+    pack_25519(q, x+16);
+}
+
+void crypto_x25519_base(uint8_t q[32], const uint8_t n[32])
+{
+    crypto_x25519(q, n, _9);
+}
+// Taken from TweetNaCl.
+// I tried the ref10 implementation, but that was too damn big
+
+
+#define FOR(i, start, end) for (size_t i = start; i < end; i++)
+#define sv static void
+#define sc static const
+
+typedef uint8_t   u8;
+typedef int64_t  i64;
+typedef uint64_t u64;
+typedef i64 gf[16];
+
+sc gf gf0;
+sc gf gf1 = { 1 };
+sc gf  D  = { 0x78a3, 0x1359, 0x4dca, 0x75eb, 0xd8ab, 0x4141, 0x0a4d, 0x0070,
+              0xe898, 0x7779, 0x4079, 0x8cc7, 0xfe73, 0x2b6f, 0x6cee, 0x5203};
+sc gf  D2 = { 0xf159, 0x26b2, 0x9b94, 0xebd6, 0xb156, 0x8283, 0x149a, 0x00e0,
+              0xd130, 0xeef3, 0x80f2, 0x198e, 0xfce7, 0x56df, 0xd9dc, 0x2406};
+sc gf  X  = { 0xd51a, 0x8f25, 0x2d60, 0xc956, 0xa7b2, 0x9525, 0xc760, 0x692c,
+              0xdc5c, 0xfdd6, 0xe231, 0xc0a4, 0x53fe, 0xcd6e, 0x36d3, 0x2169};
+sc gf  Y  = { 0x6658, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666,
+              0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666};
+sc gf  I  = { 0xa0b0, 0x4a0e, 0x1b27, 0xc4ee, 0xe478, 0xad2f, 0x1806, 0x2f43,
+              0xd7a7, 0x3dfb, 0x0099, 0x2b4d, 0xdf0b, 0x4fc1, 0x2480, 0x2b83};
+
+sc u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
+                 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
+                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 };
+
+sv set_25519(gf r, const gf a) { FOR(i, 0, 16) r[i] = a[i]; }
+
+static u8 par_25519(const gf a)
+{
+    u8 d[32];
+    pack_25519(d, a);
+    return d[0] & 1;
+}
+
+sv pow2523(gf o,const gf i)
+{
+    gf c;
+    FOR(a, 0, 16) c[a] = i[a];
+    for(int a = 250; a >= 0; a--) {
+        S(c, c);
+        if(a != 1) M(c, c, i);
+    }
+    FOR(a, 0, 16) o[a] = c[a];
+}
+
+static int vn(const u8 *x, const u8 *y, size_t n)
+{
+  uint32_t d = 0;
+  FOR(i, 0, n) d |= x[i] ^ y[i];
+  return (1 & ((d - 1) >> 8)) - 1;
+}
+
+static int neq_25519(const gf a, const gf b)
+{
+    u8 c[32],d[32];
+    pack_25519(c, a);
+    pack_25519(d, b);
+    return vn(c, d, 32);
+}
+
+sv add(gf p[4], gf q[4])
+{
+    gf a, b, c, d, t, e, f, g, h;
+    Z(a, p[1], p[0]);
+    Z(t, q[1], q[0]);
+    M(a, a, t);
+    A(b, p[0], p[1]);
+    A(t, q[0], q[1]);
+    M(b, b, t);
+    M(c, p[3], q[3]);
+    M(c, c, D2);
+    M(d, p[2], q[2]);
+    A(d, d, d);
+    Z(e, b, a);
+    Z(f, d, c);
+    A(g, d, c);
+    A(h, b, a);
+
+    M(p[0], e, f);
+    M(p[1], h, g);
+    M(p[2], g, f);
+    M(p[3], e, h);
+}
+
+sv cswap(gf p[4], gf q[4], u8 b)
+{
+    FOR(i, 0, 4)
+        sel_25519(p[i],q[i],b);
+}
+
+sv pack(u8 *r, gf p[4])
+{
+    gf tx, ty, zi;
+    inv_25519(zi, p[2]);
+    M(tx, p[0], zi);
+    M(ty, p[1], zi);
+    pack_25519(r, ty);
+    r[31] ^= par_25519(tx) << 7;
+}
+
+sv scalarmult(gf p[4], gf q[4], const u8 *s)
+{
+    set_25519(p[0], gf0);
+    set_25519(p[1], gf1);
+    set_25519(p[2], gf1);
+    set_25519(p[3], gf0);
+    for (int i = 255; i >= 0; i--) {
+        u8 b = (s[i/8] >> (i & 7)) & 1;
+        cswap(p, q, b);
+        add(q, p);
+        add(p, p);
+        cswap(p, q, b);
+    }
+}
+
+sv scalarbase(gf p[4], const u8 *s)
+{
+    gf q[4];
+    set_25519(q[0], X);
+    set_25519(q[1], Y);
+    set_25519(q[2], gf1);
+    M(q[3], X, Y);
+    scalarmult(p, q, s);
+}
+
+sv modL(u8 *r, i64 x[64])
+{
+    i64 i, j;
+    for (i = 63;i >= 32;--i) {
+        i64 carry = 0;
+        for (j = i - 32;j < i - 12;++j) {
+            x[j] += carry - 16 * x[i] * L[j - (i - 32)];
+            carry = (x[j] + 128) >> 8;
+            x[j] -= carry << 8;
+        }
+        x[j] += carry;
+        x[i] = 0;
+    }
+    i64 carry = 0;
+    FOR(j, 0, 32) {
+        x[j] += carry - (x[31] >> 4) * L[j];
+        carry = x[j] >> 8;
+        x[j] &= 255;
+    }
+    FOR(j, 0, 32) x[j] -= carry * L[j];
+    FOR(i, 0, 32) {
+        x[i+1] += x[i] >> 8;
+        r[i  ]  = x[i] & 255;
+    }
+}
+
+sv reduce(u8 r[64])
+{
+    i64 x[64];
+    FOR(i, 0, 64) x[i] = (u64) r[i];
+    FOR(i, 0, 64) r[i] = 0;
+    modL(r, x);
+}
+
+static int unpackneg(gf r[4],const u8 p[32])
+{
+    gf t, chk, num, den, den2, den4, den6;
+    set_25519(r[2], gf1);
+    unpack_25519(r[1], p);
+    S(num,r [1]);
+    M(den, num, D);
+    Z(num, num, r[2]);
+    A(den, r[2], den);
+
+    S(den2, den);
+    S(den4, den2);
+    M(den6, den4, den2);
+    M(t, den6, num);
+    M(t, t, den);
+
+    pow2523(t, t);
+    M(t, t, num);
+    M(t, t, den);
+    M(t, t, den);
+    M(r[0], t, den);
+
+    S(chk, r[0]);
+    M(chk, chk, den);
+    if (neq_25519(chk, num)) M(r[0], r[0], I);
+
+    S(chk, r[0]);
+    M(chk, chk, den);
+    if (neq_25519(chk, num)) return -1;
+
+    if (par_25519(r[0]) == (p[31]>>7)) Z(r[0],gf0,r[0]);
+
+    M(r[3], r[0], r[1]);
+    return 0;
+}
+
+#ifdef ED25519_SHA512
+    #include "sha512.h"
+    #define HASH crypto_sha512
+#else
+    #define HASH crypto_blake2b
+#endif
+
+#define COMBINE1(x, y) x ## y
+#define COMBINE2(x, y) COMBINE1(x, y)
+#define HASH_CTX    COMBINE2(HASH, _ctx)
+#define HASH_INIT   COMBINE2(HASH, _init)
+#define HASH_UPDATE COMBINE2(HASH, _update)
+#define HASH_FINAL  COMBINE2(HASH, _final)
+
+// hash function interface
+// Typical uses: sha512 for tests vectors, blake2b for production.
+void HASH_INIT  (HASH_CTX *ctx);
+void HASH_UPDATE(HASH_CTX *ctx, const u8 *in, size_t inlen);
+void HASH_FINAL (HASH_CTX *ctx, u8 hash[64]);
+void HASH(u8 hash[64], const u8 *in, size_t inlen);
+
+sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size)
+{
+    HASH_CTX ctx;
+    HASH_INIT  (&ctx);
+    HASH_UPDATE(&ctx, R , 32    );
+    HASH_UPDATE(&ctx, A , 32    );
+    HASH_UPDATE(&ctx, M , M_size);
+    HASH_FINAL (&ctx, k);
+    reduce(k);
+}
+
+void crypto_ed25519_public_key(uint8_t        public_key[32],
+                               const uint8_t  secret_key[32])
+{
+    // hash the private key, turn the hash into a scalar
+    u8 a[64];
+    HASH(a, secret_key, 32);
+    a[ 0] &= 248;
+    a[31] &= 127;
+    a[31] |= 64;
+
+    // the public key is the packed form of the point aB (B == basepoint)
+    gf aB[4];
+    scalarbase(aB, a);
+    pack(public_key, aB);
+}
+
+void crypto_ed25519_sign(uint8_t        signature[64],
+                         const uint8_t  secret_key[32],
+                         const uint8_t *message,
+                         size_t         message_size)
+{
+    u8 h[64];
+    u8 *a      = h;       // secret scalar
+    u8 *prefix = h + 32;  // prefix for nonce generation
+    HASH(h, secret_key, 32);
+
+    // build public key from secret key
+    a[ 0] &= 248;
+    a[31] &= 127;
+    a[31] |= 64;
+    gf aB[4];
+    scalarbase(aB, a);
+    u8 public_key[32];
+    pack(public_key, aB);
+
+    // Constructs the "random" nonce from the secret key and message.
+    // An actual random number would work just fine, and would save us
+    // the trouble of hashing the message twice.  If we did that
+    // however, the user could fuck it up and reuse the nonce.
+    u8 r[64];
+    HASH_CTX ctx;
+    HASH_INIT  (&ctx);
+    HASH_UPDATE(&ctx, prefix , 32          );
+    HASH_UPDATE(&ctx, message, message_size);
+    HASH_FINAL (&ctx, r);
+
+    gf rB[4];
+    reduce(r);
+    scalarbase(rB, r);
+    pack(signature, rB); // first half of the signature = "random" nonce
+
+    u8 k[64];
+    hash_k(k, signature, public_key, message, message_size);
+
+    i64 s[64]; // s = r + k a
+    FOR(i,  0, 32) s[i] = (u64) r[i];
+    FOR(i, 32, 64) s[i] = 0;
+    FOR(i, 0, 32) {
+        FOR(j, 0, 32) {
+            s[i+j] += k[i] * (u64) a[j];
+        }
+    }
+    modL(signature + 32, s);  // second half of the signature = s
+}
+
+int crypto_ed25519_check(const uint8_t  signature[64],
+                         const uint8_t  public_key[32],
+                         const uint8_t *message,
+                         size_t         message_size)
+{
+    gf aB[4];  if (unpackneg(aB, public_key)) return -1;   // -aB
+    u8 k[64];  hash_k(k, signature, public_key, message, message_size);
+    gf p[4];   scalarmult(p, aB, k);                       // p = -aB k
+    gf sB[4];  scalarbase(sB, signature + 32); add(p, sB); // p = s - aB k
+    u8 t[32];  pack(t, p);
+    return vn(signature, t, 32); // R == s - aB k ? OK : fail
+}
+
+void crypto_ae_lock_detached(uint8_t        mac[16],
+                             uint8_t       *ciphertext,
+                             const uint8_t  key[32],
+                             const uint8_t  nonce[24],
+                             const uint8_t *plaintext,
+                             size_t         text_size)
+{
+    crypto_chacha_ctx e_ctx;
+    uint8_t           auth_key[32];
+    crypto_chacha20_Xinit (&e_ctx, key, nonce);
+    crypto_chacha20_random(&e_ctx, auth_key, 32);
+
+    crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size);
+    crypto_poly1305_auth(mac, ciphertext, text_size, auth_key);
+}
+
+int crypto_ae_unlock_detached(uint8_t       *plaintext,
+                              const uint8_t  key[32],
+                              const uint8_t  nonce[24],
+                              const uint8_t  mac[16],
+                              const uint8_t *ciphertext,
+                              size_t         text_size)
+{
+    crypto_chacha_ctx e_ctx;
+    uint8_t           auth_key[32];
+    crypto_chacha20_Xinit (&e_ctx, key, nonce);
+    crypto_chacha20_random(&e_ctx, auth_key, 32);
+
+    uint8_t real_mac[16];
+    crypto_poly1305_auth(real_mac, ciphertext, text_size, auth_key);
+
+    if (crypto_memcmp_16(real_mac, mac))
+        return -1;
+
+    crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size);
+    return 0;
+}
+
+void crypto_ae_lock(uint8_t       *box,
+                    const uint8_t  key[32],
+                    const uint8_t  nonce[24],
+                    const uint8_t *plaintext,
+                    size_t         text_size)
+{
+    crypto_ae_lock_detached(box, box + 16, key, nonce, plaintext, text_size);
+}
+
+int crypto_ae_unlock(uint8_t       *plaintext,
+                     const uint8_t  key[32],
+                     const uint8_t  nonce[24],
+                     const uint8_t *box,
+                     size_t         text_size)
+{
+    return crypto_ae_unlock_detached(plaintext, key, nonce,
+                                     box, box + 16, text_size);
+}
+
+void crypto_lock_key(uint8_t       shared_key[32],
+                     const uint8_t your_secret_key [32],
+                     const uint8_t their_public_key[32])
+{
+    static const uint8_t _0[16];
+    uint8_t shared_secret[32];
+    crypto_x25519(shared_secret, your_secret_key, their_public_key);
+    crypto_chacha20_H(shared_key, shared_secret, _0);
+}
+
+void crypto_lock_detached(uint8_t        mac[16],
+                          uint8_t       *ciphertext,
+                          const uint8_t  your_secret_key [32],
+                          const uint8_t  their_public_key[32],
+                          const uint8_t  nonce[24],
+                          const uint8_t *plaintext,
+                          size_t         text_size)
+{
+    uint8_t shared_key[32];
+    crypto_lock_key(shared_key, your_secret_key, their_public_key);
+    crypto_ae_lock_detached(mac, ciphertext,
+                            shared_key, nonce,
+                            plaintext, text_size);
+}
+
+int crypto_unlock_detached(uint8_t       *plaintext,
+                           const uint8_t  your_secret_key [32],
+                           const uint8_t  their_public_key[32],
+                           const uint8_t  nonce[24],
+                           const uint8_t  mac[16],
+                           const uint8_t *ciphertext,
+                           size_t         text_size)
+{
+    uint8_t shared_key[32];
+    crypto_lock_key(shared_key, your_secret_key, their_public_key);
+    return crypto_ae_unlock_detached(plaintext,
+                                     shared_key, nonce,
+                                     mac, ciphertext, text_size);
+}
+
+void crypto_lock(uint8_t       *box,
+                 const uint8_t  your_secret_key [32],
+                 const uint8_t  their_public_key[32],
+                 const uint8_t  nonce[24],
+                 const uint8_t *plaintext,
+                 size_t         text_size)
+{
+    crypto_lock_detached(box, box + 16,
+                         your_secret_key, their_public_key, nonce,
+                         plaintext, text_size);
+}
+
+int crypto_unlock(uint8_t       *plaintext,
+                  const uint8_t  your_secret_key [32],
+                  const uint8_t  their_public_key[32],
+                  const uint8_t  nonce[24],
+                  const uint8_t *box,
+                  size_t         text_size)
+{
+    return crypto_unlock_detached(plaintext,
+                                  your_secret_key, their_public_key, nonce,
+                                  box, box + 16, text_size);
+}
+
+static const uint8_t null_nonce[24] = {};
+
+void crypto_anonymous_lock(uint8_t       *box,
+                           const uint8_t  random_secret_key[32],
+                           const uint8_t  their_public_key[32],
+                           const uint8_t *plaintext,
+                           size_t         text_size)
+{
+    crypto_x25519_base(box, random_secret_key); // put public key in box
+    crypto_lock(box + 32,
+                random_secret_key, their_public_key, null_nonce,
+                plaintext, text_size);
+}
+
+int crypto_anonymous_unlock(uint8_t       *plaintext,
+                            const uint8_t  your_secret_key[32],
+                            const uint8_t *box,
+                            size_t         text_size)
+{
+    return crypto_unlock(plaintext,
+                         your_secret_key, box, null_nonce,
+                         box + 32, text_size);
+}
diff --git a/monocypher.h b/monocypher.h
new file mode 100644
index 0000000..7829768
--- /dev/null
+++ b/monocypher.h
@@ -0,0 +1,335 @@
+#ifndef MONOCYPHER_H
+#define MONOCYPHER_H
+
+#include <inttypes.h>
+#include <stddef.h>
+
+// This is a chacha20 context.
+// To use safely, just follow these guidelines:
+// - Always initialize your context with one of the crypto_init_* functions below
+// - Dont't modify it, except through the crypto_chacha20_* below.
+// - Never duplicate it.
+typedef struct crypto_chacha_ctx {
+    uint32_t input[16];       // current input, unencrypted
+    uint8_t  random_pool[64]; // last input, encrypted
+    uint8_t  pool_index;      // pointer to random_pool
+} crypto_chacha_ctx;
+
+// HChacha20.  *Kind* of a cryptographic hash, based on the chacha20 rounds.
+// Used for XChacha20, and the key derivation of the X25519 shared secret.
+// Don't use it unless you really know what you're doing.
+void
+crypto_chacha20_H(uint8_t       out[32],
+                  const uint8_t key[32],
+                  const uint8_t in [16]);
+
+// Initializes a chacha context.
+//
+// WARNING: DON'T USE THE SAME NONCE AND KEY TWICE
+//
+// You'd be exposing the XOR of subsequent encrypted
+// messages, thus destroying your confidentiality.
+//
+// WARNING: DON'T SELECT THE NONCE AT RANDOM
+//
+// If you encode enough messages with a random nonce, there's a good
+// chance some of them will use the same nonce by accident. 64 bits
+// just isn't enough for this.  Use a counter instead.
+//
+// If there are multiple parties sending out messages, you can give them
+// all an initial nonce of 0, 1 .. n-1 respectively, and have them increment
+// their nonce  by n.  (Also make sure the nonces never wrap around.).
+void
+crypto_chacha20_init(crypto_chacha_ctx *ctx,
+                     const uint8_t      key[32],
+                     const uint8_t      nonce[8]);
+
+// Initializes a chacha context, with a big nonce (192 bits),
+// more than enough to be selected at random.
+//
+// The price you pay for that is a slower initialization.  The security
+// guarantees are the same as regular initialization.
+void
+crypto_chacha20_Xinit(crypto_chacha_ctx *ctx,
+                      const uint8_t      key[32],
+                      const uint8_t      nonce[24]);
+
+// Encrypts the plain_text by XORing it with a pseudo-random
+// stream of numbers, seeded by the provided chacha20 context.
+// Decryption uses the exact same method.
+//
+// Once the context is initialized, encryptions can safely be chained thus:
+//
+//    crypto_encrypt_chacha20(ctx, plain_0, cipher_0, length_0);
+//    crypto_encrypt_chacha20(ctx, plain_1, cipher_1, length_1);
+//    crypto_encrypt_chacha20(ctx, plain_2, cipher_2, length_2);
+//
+// plain_text and cipher_text may point to the same location, for in-place
+// encryption.
+//
+// plain_text is allowed to be null (0), in which case it will be
+// interpreted as an all zero input.  The cipher_text will then
+// contain the raw chacha20 stream.  Useful as a random number
+// generator.
+//
+// WARNING: ENCRYPTION ALONE IS NOT SECURE.  YOU NEED AUTHENTICATION AS WELL.
+// Use the provided authenticated encryption constructions.
+void
+crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
+                        const uint8_t     *plain_text,
+                        uint8_t           *cipher_text,
+                        size_t             message_size);
+
+// convenience function.  Same as chacha20_encrypt() with a null plain_text.
+void
+crypto_chacha20_random(crypto_chacha_ctx *ctx,
+                       uint8_t           *cipher_text,
+                       size_t             message_size);
+
+
+typedef struct {
+    uint32_t r[4];
+    uint32_t h[5];
+    uint32_t c[5];
+    uint32_t pad[5];
+    size_t   c_index;
+} crypto_poly1305_ctx;
+
+
+// Initializes the poly1305 context with the secret key.
+// Call first (obviously).
+// WARNING: NEVER AUTHENTICATE 2 MESSAGES WITH THE SAME KEY.
+// This is a ONE TIME authenticator.  If you authenticate 2 messages
+// with the same key, the attacker may deduce your secret key and
+// authenticate messages in your stead.
+void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]);
+
+// Updates the poly1305 context with a chunk of the message
+// Can be called multiple times, once for each chunk.
+// Make sure the chunks are processed in order, without overlap or hole...
+void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
+                            const uint8_t *m, size_t bytes);
+
+// Authenticate the message munched through previous update() calls.
+// Call last (obviously).
+void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]);
+
+
+// Convenience all in one function
+void crypto_poly1305_auth(uint8_t        mac[16],
+                          const uint8_t *m,
+                          size_t         msg_length,
+                          const uint8_t  key[32]);
+
+// Constant time equality verification
+// returns 0 if it matches, something else otherwise.
+int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]);
+
+
+// blake2b context
+typedef struct {
+    uint8_t  buf[128];      // input buffer
+    uint64_t hash[8];       // chained state
+    uint64_t input_size[2]; // total number of bytes
+    uint8_t  c;             // pointer for buf[]
+    uint8_t  output_size;   // digest size
+} crypto_blake2b_ctx;
+
+// Initializes the context with user defined parameters:
+// outlen: the length of the hash.  Must be between 1 and 64.
+// keylen: length of the key.       Must be between 0 and 64.
+// key   : some secret key.         May be NULL if keylen is 0.
+// Any deviation from these invariants results in UNDEFINED BEHAVIOR
+void
+crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen,
+                            const uint8_t      *key, size_t keylen);
+
+// Convenience function: 64 bytes hash, no secret key.
+void
+crypto_blake2b_init(crypto_blake2b_ctx *ctx);
+
+// Add "inlen" bytes from "in" into the hash.
+void
+crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen);
+
+// Generate the message digest (size given in init).
+void
+crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out);
+
+// All-in-one convenience function.
+// outlen, keylen, and key work the same as they do in the general_init function
+void
+crypto_blake2b_general(      uint8_t *out, size_t outlen, // digest
+                       const uint8_t *key, size_t keylen, // optional secret key
+                       const uint8_t *in , size_t inlen); // data to be hashed
+
+// All-in-one convenience function: 64 bytes hash, no secret key.
+void
+crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen);
+
+
+
+// Implements argon2i, with degree of paralelism 1,
+// because it's good enough, and threads are scary.
+//
+// key and ad are optionnal.  They can be NULL if their respective size is 0.
+// work_area is a pointer to a contiguous chunk of memory of at least
+// nb_blocks * 1024 bytes.  It must be suitably aligned for 64-bit words.
+// Don't worry too much about alignment, malloc()'s results work.
+//
+// Choice of parameters for password hashing:
+// - If you need a key, use a 32 bytes one.
+// - Do what you will with the ad.
+// - Use a 32 bytes tag (to get a 256-bit key)
+// - Put 128 bits of entropy in the salt.  16 random bytes work well.
+// - Use all the memory you can get away with.
+// - Use as much iterations as reasonable.  No less than 10 passes if you can.
+void
+crypto_argon2i_hash(uint8_t       *tag,       uint32_t tag_size,      // >= 4
+                    const uint8_t *password,  uint32_t password_size,
+                    const uint8_t *salt,      uint32_t salt_size,     // >= 8
+                    const uint8_t *key,       uint32_t key_size,
+                    const uint8_t *ad,        uint32_t ad_size,
+                    void    *work_area,
+                    uint32_t nb_blocks,                               // >= 8
+                    uint32_t nb_iterations);
+
+// Convenience function. No key, no ad, 64 bytes tag
+void
+crypto_argon2i(uint8_t        tag[32],
+               const uint8_t *password,  uint32_t password_size,
+               const uint8_t *salt,      uint32_t salt_size,     // >= 8
+               void    *work_area,
+               uint32_t nb_blocks,                               // >= 8
+               uint32_t nb_iterations);
+
+
+// Computes a shared secret from your private key and their public key.
+// WARNING: DO NOT USE THE SHARED SECRET DIRECTLY.
+// The shared secret is not pseudo-random.  You need to hash it to derive
+// an acceptable secret key.  Any cryptographic hash can work, as well as
+// HChacha20.
+//
+// Implementation details: this is an elliptic curve.  The public key is
+// a point on this curve, and your private key is a scalar.  The shared
+// secret is another point on this curve, obtained by scalar multiplication.
+// Basically:
+//     shared_secret == your_sk * their_pk == your_sk * (their_sk * base_point)
+//                   == their_sk * your_pk == their_sk * (your_sk * base_point)
+void crypto_x25519(uint8_t       shared_secret   [32],
+                   const uint8_t your_secret_key [32],
+                   const uint8_t their_public_key[32]);
+
+// Generates a public key from the specified secret key.
+// Make sure the secret key is randomly selected.
+//
+// Implementation detail: your secret key is a scalar, and we multiply
+// the base point (a constant) by it to obtain a public key.  That is:
+//     public_key == secret_key * base_point
+// Reversing the operation is conjectured to be infeasible
+// without quantum computers (128 bits of security).
+void crypto_x25519_base(uint8_t public_key[32], const uint8_t secret_key[32]);
+
+
+void crypto_ed25519_public_key(uint8_t        public_key[32],
+                               const uint8_t  secret_key[32]);
+
+void crypto_ed25519_sign(uint8_t        signature[64],
+                         const uint8_t  secret_key[32],
+                         const uint8_t *message,
+                         size_t         message_size);
+
+int crypto_ed25519_check(const uint8_t  signature[64],
+                         const uint8_t  public_key[32],
+                         const uint8_t *message,
+                         size_t         message_size);
+
+
+// Authenticated encryption with XChacha20 and Poly1305.
+void crypto_ae_lock_detached(uint8_t        mac[16],
+                             uint8_t       *ciphertext,
+                             const uint8_t  key[32],
+                             const uint8_t  nonce[24],
+                             const uint8_t *plaintext,
+                             size_t         text_size);
+
+// Authenticated encryption with XChacha20 and Poly1305.
+// Returns -1 and has no effect if the message is forged.
+int crypto_ae_unlock_detached(uint8_t       *plaintext,
+                              const uint8_t  key[32],
+                              const uint8_t  nonce[24],
+                              const uint8_t  mac[16],
+                              const uint8_t *ciphertext,
+                              size_t         text_size);
+
+// Like the above, only puts the mac and the ciphertext together
+// in a "box", mac first
+void crypto_ae_lock(uint8_t       *box,      // text_size + 16
+                    const uint8_t  key[32],
+                    const uint8_t  nonce[24],
+                    const uint8_t *plaintext,
+                    size_t         text_size);
+
+// Unlocks a box locked by aead_lock()
+int crypto_ae_unlock(uint8_t       *plaintext,
+                     const uint8_t  key[32],
+                     const uint8_t  nonce[24],
+                     const uint8_t *box,     // text_size + 16
+                     size_t         text_size);
+
+
+// Computes a shared key with your secret key and their public key,
+// suitable for crypto_ae* functions.
+void crypto_lock_key(uint8_t       shared_key      [32],
+                     const uint8_t your_secret_key [32],
+                     const uint8_t their_public_key[32]);
+
+// Authenticated encryption with the sender's secret key and the recipient's
+// public key.  The message leaks if one of the secret key gets compromised.
+void crypto_lock_detached(uint8_t        mac[16],
+                          uint8_t       *ciphertext,
+                          const uint8_t  your_secret_key [32],
+                          const uint8_t  their_public_key[32],
+                          const uint8_t  nonce[24],
+                          const uint8_t *plaintext,
+                          size_t         text_size);
+
+// Authenticated decryption with the recipient's secret key, and the sender's
+// public key.  Has no effect if the message is forged.
+int crypto_unlock_detached(uint8_t       *plaintext,
+                           const uint8_t  your_secret_key [32],
+                           const uint8_t  their_public_key[32],
+                           const uint8_t  nonce[24],
+                           const uint8_t  mac[16],
+                           const uint8_t *ciphertext,
+                           size_t         text_size);
+
+// Like the above, only puts the mac and the ciphertext together
+// in a "box", mac first
+void crypto_lock(uint8_t       *box,
+                 const uint8_t  your_secret_key [32],
+                 const uint8_t  their_public_key[32],
+                 const uint8_t  nonce[24],
+                 const uint8_t *plaintext,
+                 size_t         text_size);
+
+// Unlocks a box locked by crypto_lock()
+int crypto_unlock(uint8_t       *plaintext,
+                  const uint8_t  your_secret_key [32],
+                  const uint8_t  their_public_key[32],
+                  const uint8_t  nonce[24],
+                  const uint8_t *box,
+                  size_t         text_size);
+
+void crypto_anonymous_lock(uint8_t       *box,
+                           const uint8_t  random_secret_key[32],
+                           const uint8_t  their_public_key[32],
+                           const uint8_t *plaintext,
+                           size_t         text_size);
+
+int crypto_anonymous_unlock(uint8_t       *plaintext,
+                            const uint8_t  your_secret_key[32],
+                            const uint8_t *box,
+                            size_t         text_size);
+
+#endif // MONOCYPHER_H
diff --git a/poly1305.c b/poly1305.c
deleted file mode 100644
index c124977..0000000
--- a/poly1305.c
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "poly1305.h"
-
-static uint32_t load32_le(const uint8_t s[4])
-{
-    return s[0]
-        | (s[1] <<  8)
-        | (s[2] << 16)
-        | (s[3] << 24);
-}
-
-static void store32_le(uint8_t output[4], uint32_t input)
-{
-    output[0] =  input        & 0xff;
-    output[1] = (input >>  8) & 0xff;
-    output[2] = (input >> 16) & 0xff;
-    output[3] = (input >> 24) & 0xff;
-}
-
-static void poly_load(uint32_t out[4], const uint8_t in[16])
-{
-    for (int i = 0; i < 4; i++)
-        out[i] = load32_le(in + i*4);
-}
-
-static void poly_add(uint32_t out[5], const uint32_t a[5], const uint32_t b[5])
-{
-    uint64_t carry = 0;
-    for (int i = 0; i < 5; i++) {
-        carry  += (int64_t)(a[i]) + b[i];
-        out[i]  = carry & 0xffffffff; // lower 32 bits right there.
-        carry >>= 32;                 // retain the carry
-    }
-}
-
-// h = (h + c) * r
-static void poly_block(crypto_poly1305_ctx *ctx)
-{
-    // h + c, without carry propagation
-    const uint64_t h0 = ctx->h[0] + (uint64_t)ctx->c[0];
-    const uint64_t h1 = ctx->h[1] + (uint64_t)ctx->c[1];
-    const uint64_t h2 = ctx->h[2] + (uint64_t)ctx->c[2];
-    const uint64_t h3 = ctx->h[3] + (uint64_t)ctx->c[3];
-    const uint64_t h4 = ctx->h[4] + (uint64_t)ctx->c[4];
-
-    // Local all the things!
-    const uint64_t r0 = ctx->r[0];
-    const uint64_t r1 = ctx->r[1];
-    const uint64_t r2 = ctx->r[2];
-    const uint64_t r3 = ctx->r[3];
-    const uint64_t rr0 = (ctx->r[0] >> 2) * 5; // lose 2 bottom bits...
-    const uint64_t rr1 = (ctx->r[1] >> 2) * 5; // 2 bottom bits already cleared
-    const uint64_t rr2 = (ctx->r[2] >> 2) * 5; // 2 bottom bits already cleared
-    const uint64_t rr3 = (ctx->r[3] >> 2) * 5; // 2 bottom bits already cleared
-
-    // (h + c) * r, without carry propagation
-    const uint64_t x0 = h0*r0 + h1*rr3 + h2*rr2 + h3*rr1 + h4*rr0;
-    const uint64_t x1 = h0*r1 + h1*r0  + h2*rr3 + h3*rr2 + h4*rr1;
-    const uint64_t x2 = h0*r2 + h1*r1  + h2*r0  + h3*rr3 + h4*rr2;
-    const uint64_t x3 = h0*r3 + h1*r2  + h2*r1  + h3*r0  + h4*rr3;
-    const uint64_t x4 = h4 * (r0 & 3); // ...recover those 2 bits
-
-    // carry propagation, put ctx->h under 2^130
-    const uint64_t msb = x4 + (x3 >> 32);
-    uint64_t       u   = (msb >> 2) * 5; // lose 2 bottom bits...
-    u += (x0 & 0xffffffff)             ;  ctx->h[0] = u & 0xffffffff;  u >>= 32;
-    u += (x1 & 0xffffffff) + (x0 >> 32);  ctx->h[1] = u & 0xffffffff;  u >>= 32;
-    u += (x2 & 0xffffffff) + (x1 >> 32);  ctx->h[2] = u & 0xffffffff;  u >>= 32;
-    u += (x3 & 0xffffffff) + (x2 >> 32);  ctx->h[3] = u & 0xffffffff;  u >>= 32;
-    u += msb & 3 /* ...recover them */ ;  ctx->h[4] = u;
-}
-
-// (re-)initializes the input counter and input buffer
-static void poly_clear_c(crypto_poly1305_ctx *ctx)
-{
-    for (int i = 0; i < 4; i++)
-        ctx->c[i] = 0;
-    ctx->c_index = 0;
-}
-
-void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32])
-{
-    // initial h: zero
-    for (int i =  0; i < 5; i++)
-        ctx->h [i] = 0;
-    // initial r: first half of the key, minus a few bits
-    poly_load(ctx->r, key);
-    ctx->r[0] &= 0x0fffffff; // clear top 4 bits
-    ctx->r[1] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
-    ctx->r[2] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
-    ctx->r[3] &= 0x0ffffffc; // clear top 4 & bottom 2 bits
-    ctx->c[4]  = 1;
-    // second half of the key, saved for later
-    poly_load(ctx->pad, key + 16);
-    ctx->pad[4] = 0;
-    // buffer and counter
-    poly_clear_c(ctx);
-}
-
-void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
-                            const uint8_t *m, size_t bytes)
-{
-    while (bytes > 0) {
-        if (ctx->c_index == 16) {
-            poly_block(ctx);
-            poly_clear_c(ctx);
-        }
-        // feed the input buffer
-        ctx->c[ctx->c_index / 4] |= *m << ((ctx->c_index % 4) * 8);
-        ctx->c_index++;
-        m++;
-        bytes--;
-    }
-}
-
-void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16])
-{
-    // move the final 1 according to remaining input length
-    ctx->c[4] = 0;
-    ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8);
-    // one last hash update...
-    poly_block(ctx);
-    // ... this time with full modular reduction
-    // We only need to conditionally subtract 2^130-5,
-    // using bit twidling to prevent timing attacks.
-    static const uint32_t minus_p[5] = { 5, 0, 0, 0, 0xfffffffc };
-    uint32_t h_minus_p[5];
-    poly_add(h_minus_p, ctx->h, minus_p);
-    uint32_t negative = ~(-(h_minus_p[4] >> 31)); // 0 or -1 (2's complement)
-    for (int i = 0; i < 5; i++) {
-        ctx->h[i] ^= negative & (ctx->h[i] ^ h_minus_p[i]);
-    }
-    // Add the secret pad to the final hash before output
-    poly_add(ctx->h, ctx->h, ctx->pad);
-    for (int i = 0; i < 4; i++)
-        store32_le(mac + i*4, ctx->h[i]);
-}
-
-void crypto_poly1305_auth(uint8_t mac[16], const uint8_t *m,
-                          size_t  m_size , const uint8_t  key[32])
-{
-    crypto_poly1305_ctx ctx;
-    crypto_poly1305_init  (&ctx, key);
-    crypto_poly1305_update(&ctx, m, m_size);
-    crypto_poly1305_finish(&ctx, mac);
-}
-
-int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16])
-{
-    unsigned diff = 0;
-    for (int i = 0; i < 16; i++) {
-        diff |= (mac1[i] ^ mac2[i]);
-    }
-    return diff;
-}
diff --git a/poly1305.h b/poly1305.h
deleted file mode 100644
index ce3571e..0000000
--- a/poly1305.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef POLY1305_H
-#define POLY1305_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-typedef struct {
-    uint32_t r[4];
-    uint32_t h[5];
-    uint32_t c[5];
-    uint32_t pad[5];
-    size_t   c_index;
-} crypto_poly1305_ctx;
-
-
-// Initializes the poly1305 context with the secret key.
-// Call first (obviously).
-// WARNING: NEVER AUTHENTICATE 2 MESSAGES WITH THE SAME KEY.
-// This is a ONE TIME authenticator.  If you authenticate 2 messages
-// with the same key, the attacker may deduce your secret key and
-// authenticate messages in your stead.
-void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]);
-
-// Updates the poly1305 context with a chunk of the message
-// Can be called multiple times, once for each chunk.
-// Make sure the chunks are processed in order, without overlap or hole...
-void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
-                            const uint8_t *m, size_t bytes);
-
-// Authenticate the message munched through previous update() calls.
-// Call last (obviously).
-void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]);
-
-
-// Convenience all in one function
-void crypto_poly1305_auth(uint8_t        mac[16],
-                          const uint8_t *m,
-                          size_t         msg_length,
-                          const uint8_t  key[32]);
-
-// Constant time equality verification
-// returns 0 if it matches, something else otherwise.
-int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]);
-
-#endif // POLY1305_H
diff --git a/test.c b/test.c
index 0dd7678..e7c6fa3 100644
--- a/test.c
+++ b/test.c
@@ -1,16 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include <inttypes.h>
-#include <stddef.h>
 #include <string.h>
-#include "chacha20.h"
-#include "blake2b.h"
-#include "poly1305.h"
-#include "argon2i.h"
-#include "ae.h"
-#include "lock.h"
-#include "x25519.h"
-#include "ed25519.h"
+#include "monocypher.h"
 #include "sha512.h"
 
 /////////////////////////
diff --git a/x25519.c b/x25519.c
deleted file mode 100644
index 25ac9dd..0000000
--- a/x25519.c
+++ /dev/null
@@ -1,156 +0,0 @@
-// Taken from TweetNaCl
-
-#include "x25519.h"
-
-#define FOR(i, start, end) for (size_t i = start; i < end; i++)
-#define sv static void
-typedef int64_t gf[16];
-
-static const uint8_t _0[16];
-static const uint8_t _9[32] = { 9 };
-static const gf _121665 = { 0xdb41, 1 };
-
-sv car_25519(gf o)
-{
-    FOR(i, 0, 16) {
-        o[i]              += 1LL  << 16;
-        int64_t c          = o[i] >> 16;
-        o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15));
-        o[i]              -= c << 16;
-    }
-}
-
-sv sel_25519(gf p, gf q, int b)
-{
-    int64_t c = ~(b-1);
-    FOR(i, 0, 16) {
-        int64_t t = c & (p[i] ^ q[i]);
-        p[i]     ^= t;
-        q[i]     ^= t;
-    }
-}
-
-sv pack_25519(uint8_t *o, const gf n)
-{
-    gf t;
-    FOR(i, 0, 16) t[i] = n[i];
-    car_25519(t);
-    car_25519(t);
-    car_25519(t);
-    FOR(j, 0, 2) {
-        gf m;
-        m[0] = t[0] - 0xffed;
-        FOR(i, 1, 15) {
-            m[i  ]  = t[i] - 0xffff - ((m[i-1] >> 16) & 1);
-            m[i-1] &= 0xffff;
-        }
-        m[15]  = t[15] - 0x7fff - ((m[14] >> 16) & 1);
-        int b  = (m[15] >> 16) & 1;
-        m[14] &= 0xffff;
-        sel_25519(t, m, 1-b);
-    }
-    FOR(i, 0, 16) {
-        o[2*i    ] = t[i] & 0xff;
-        o[2*i + 1] = t[i] >> 8;
-    }
-}
-
-sv unpack_25519(gf o, const uint8_t *n)
-{
-    FOR(i, 0, 16) o[i] = n[2*i] + ((int64_t)n[2*i + 1] << 8);
-    o[15] &= 0x7fff;
-}
-
-sv A(gf o, const gf a, const gf b)
-{
-    FOR(i, 0, 16) o[i] = a[i] + b[i];
-}
-
-sv Z(gf o, const gf a, const gf b)
-{
-    FOR(i, 0, 16) o[i] = a[i] - b[i];
-}
-
-sv M(gf o, const gf a, const gf b)
-{
-    int64_t t[31];
-    FOR(i, 0, 31) t[i] = 0;
-    FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j];
-    FOR(i, 0, 15) t[i] += 38 * t[i+16];
-    FOR(i, 0, 16) o[i] = t[i];
-    car_25519(o);
-    car_25519(o);
-}
-
-sv S(gf o,const gf a)
-{
-    M(o, a, a);
-}
-
-sv inv_25519(gf o,const gf i)
-{
-    gf c;
-    FOR(a, 0, 16) c[a] = i[a];
-    for(int a = 253; a >= 0; a--) {
-        S(c, c);
-        if(a != 2 && a != 4)
-            M(c, c, i);
-    }
-    FOR(a, 0, 16) o[a] = c[a];
-}
-
-void crypto_x25519(uint8_t q[32], const uint8_t n[32], const uint8_t p[32])
-{
-    uint8_t z[32];
-    int64_t x[80];
-    int64_t r;
-    gf a, b, c, d, e, f;
-    FOR(i, 0, 31) z[i] = n[i];
-    z[31]  = (n[31] & 127) | 64;
-    z[0 ] &= 248;
-    unpack_25519(x, p);
-    FOR(i, 0, 16) {
-        b[i] = x[i];
-        d[i] = a[i] = c[i] = 0;
-    }
-    a[0] = d[0] = 1;
-    for(int i = 254; i>=0; i--) {
-        r = (z[i>>3] >> (i & 7)) & 1;
-        sel_25519(a, b, r);
-        sel_25519(c, d, r);
-        A(e, a, c);
-        Z(a, a, c);
-        A(c, b, d);
-        Z(b, b, d);
-        S(d, e);
-        S(f, a);
-        M(a, c, a);
-        M(c, b, e);
-        A(e, a, c);
-        Z(a, a, c);
-        S(b, a);
-        Z(c, d, f);
-        M(a, c, _121665);
-        A(a, a, d);
-        M(c, c, a);
-        M(a, d, f);
-        M(d, b, x);
-        S(b, e);
-        sel_25519(a, b, r);
-        sel_25519(c, d, r);
-    }
-    FOR(i, 0, 16) {
-        x[i+16] = a[i];
-        x[i+32] = c[i];
-        x[i+48] = b[i];
-        x[i+64] = d[i];
-    }
-    inv_25519(x+32, x+32);
-    M(x+16, x+16, x+32);
-    pack_25519(q, x+16);
-}
-
-void crypto_x25519_base(uint8_t q[32], const uint8_t n[32])
-{
-    crypto_x25519(q, n, _9);
-}
diff --git a/x25519.h b/x25519.h
deleted file mode 100644
index 1c5d2ac..0000000
--- a/x25519.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef X25519_H
-#define X25519_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-// Computes a shared secret from your private key and their public key.
-// WARNING: DO NOT USE THE SHARED SECRET DIRECTLY.
-// The shared secret is not pseudo-random.  You need to hash it to derive
-// an acceptable secret key.  Any cryptographic hash can work, as well as
-// HChacha20.
-//
-// Implementation details: this is an elliptic curve.  The public key is
-// a point on this curve, and your private key is a scalar.  The shared
-// secret is another point on this curve, obtained by scalar multiplication.
-// Basically:
-//     shared_secret == your_sk * their_pk == your_sk * (their_sk * base_point)
-//                   == their_sk * your_pk == their_sk * (your_sk * base_point)
-void crypto_x25519(uint8_t       shared_secret   [32],
-                   const uint8_t your_secret_key [32],
-                   const uint8_t their_public_key[32]);
-
-// Generates a public key from the specified secret key.
-// Make sure the secret key is randomly selected.
-//
-// Implementation detail: your secret key is a scalar, and we multiply
-// the base point (a constant) by it to obtain a public key.  That is:
-//     public_key == secret_key * base_point
-// Reversing the operation is conjectured to be infeasible
-// without quantum computers (128 bits of security).
-void crypto_x25519_base(uint8_t public_key[32], const uint8_t secret_key[32]);
-
-
-#endif // X25519_H
-- 
2.47.3