From: Loup Vaillant Date: Fri, 10 Feb 2017 14:49:02 +0000 (+0100) Subject: all in one compilation unit X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=5d14fff16dce9636855a03585e990da6f577078e;p=Monocypher.git all in one compilation unit --- diff --git a/ae.c b/ae.c deleted file mode 100644 index 95624d2..0000000 --- a/ae.c +++ /dev/null @@ -1,60 +0,0 @@ -#include "ae.h" -#include "chacha20.h" -#include "poly1305.h" - -void crypto_ae_lock_detached(uint8_t mac[16], - uint8_t *ciphertext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size) -{ - crypto_chacha_ctx e_ctx; - uint8_t auth_key[32]; - crypto_chacha20_Xinit (&e_ctx, key, nonce); - crypto_chacha20_random(&e_ctx, auth_key, 32); - - crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size); - crypto_poly1305_auth(mac, ciphertext, text_size, auth_key); -} - -int crypto_ae_unlock_detached(uint8_t *plaintext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t mac[16], - const uint8_t *ciphertext, - size_t text_size) -{ - crypto_chacha_ctx e_ctx; - uint8_t auth_key[32]; - crypto_chacha20_Xinit (&e_ctx, key, nonce); - crypto_chacha20_random(&e_ctx, auth_key, 32); - - uint8_t real_mac[16]; - crypto_poly1305_auth(real_mac, ciphertext, text_size, auth_key); - - if (crypto_memcmp_16(real_mac, mac)) - return -1; - - crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size); - return 0; -} - -void crypto_ae_lock(uint8_t *box, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size) -{ - crypto_ae_lock_detached(box, box + 16, key, nonce, plaintext, text_size); -} - -int crypto_ae_unlock(uint8_t *plaintext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *box, - size_t text_size) -{ - return crypto_ae_unlock_detached(plaintext, key, nonce, - box, box + 16, text_size); -} diff --git a/ae.h b/ae.h deleted file mode 100644 index 9bd6fa2..0000000 --- a/ae.h +++ /dev/null @@ 
-1,44 +0,0 @@ -#ifndef AE_H -#define AE_H - -#include -#include - - -// Authenticated encryption with XChacha20 and Poly1305. -void crypto_ae_lock_detached(uint8_t mac[16], - uint8_t *ciphertext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size); - -// Authenticated encryption with XChacha20 and Poly1305. -// Returns -1 and has no effect if the message is forged. -int crypto_ae_unlock_detached(uint8_t *plaintext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t mac[16], - const uint8_t *ciphertext, - size_t text_size); - -// Like the above, only puts the mac and the ciphertext together -// in a "box", mac first -void crypto_ae_lock(uint8_t *box, // text_size + 16 - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size); - -// Unlocks a box locked by aead_lock() -int crypto_ae_unlock(uint8_t *plaintext, - const uint8_t key[32], - const uint8_t nonce[24], - const uint8_t *box, // text_size + 16 - size_t text_size); - - - - - -#endif // AE_H diff --git a/argon2i.c b/argon2i.c deleted file mode 100644 index 771956e..0000000 --- a/argon2i.c +++ /dev/null @@ -1,403 +0,0 @@ -#include "argon2i.h" -#include "blake2b.h" - -///////////////// -/// Utilities /// -///////////////// - -static uint64_t -load64_le(const uint8_t s[8]) -{ - // Portable, slow way - return (uint64_t)s[0] - | ((uint64_t)s[1] << 8) - | ((uint64_t)s[2] << 16) - | ((uint64_t)s[3] << 24) - | ((uint64_t)s[4] << 32) - | ((uint64_t)s[5] << 40) - | ((uint64_t)s[6] << 48) - | ((uint64_t)s[7] << 56); -} - -static void -store32_le(uint8_t output[4], uint32_t input) -{ - // Portable, slow way. - output[0] = input & 0xff; - output[1] = (input >> 8) & 0xff; - output[2] = (input >> 16) & 0xff; - output[3] = (input >> 24) & 0xff; -} - -static void -store64_le(uint8_t output[8], uint64_t input) -{ - // Portable, slow way. 
- output[0] = input & 0xff; - output[1] = (input >> 8) & 0xff; - output[2] = (input >> 16) & 0xff; - output[3] = (input >> 24) & 0xff; - output[4] = (input >> 32) & 0xff; - output[5] = (input >> 40) & 0xff; - output[6] = (input >> 48) & 0xff; - output[7] = (input >> 56) & 0xff; -} - -static uint64_t -rotr64(uint64_t x, uint64_t y) -{ - return (x >> y) ^ (x << (64 - y)); -} - -static uint32_t -min(uint32_t a, uint32_t b) -{ - return a <= b ? a : b; -} - -// updates a blake2 hash with a 32 bit word, little endian. -static void -blake_update_32(crypto_blake2b_ctx *ctx, uint32_t input) -{ - uint8_t buf[4]; - store32_le(buf, input); - crypto_blake2b_update(ctx, buf, 4); -} - -////////////////// -// Argon2 block // -////////////////// -typedef struct block { - uint64_t a[128]; // 1024 octets in 128 64-bit words -} block; - -static void -load_block(block *b, const uint8_t bytes[1024]) -{ - for (int i = 0; i < 128; i++) { - b->a[i] = load64_le(bytes + i * 8); - } -} - -static void -store_block(uint8_t bytes[1024], const block *b) -{ - for (int i = 0; i < 128; i++) { - store64_le(bytes + i * 8, b->a[i]); - } -} - -static void -copy_block(block *out, const block *in) -{ - for (int i = 0; i < 128; i++) { - out->a[i] = in->a[i]; - } -} - -static void -xor_block(block *out, const block *in) -{ - for (int i = 0; i < 128; i++) { - out->a[i] ^= in->a[i]; - } -} - -//////////////////// -// Argon2i proper // -//////////////////// - -// Hash with a virtually unlimited digest size. -// Doesn't extract more entropy than the base hash function. -// Mainly used for filling a whole kilobyte block with pseudo-random bytes. 
-static void -extended_hash(uint8_t *digest, uint32_t digest_size, - const uint8_t *input , uint32_t input_size) -{ - crypto_blake2b_ctx ctx; - crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0); - blake_update_32 (&ctx, digest_size); - crypto_blake2b_update (&ctx, input, input_size); - crypto_blake2b_final (&ctx, digest); - - if (digest_size > 64) { - // the conversion to u64 avoids integer overflow on - // ludicrously big hash sizes. - uint32_t r = (((uint64_t)digest_size + 31) / 32) - 2; - uint32_t i = 1; - uint32_t in = 0; - uint32_t out = 32; - while (i < r) { - // Input and output overlap. - // This shouldn't be a problem. - crypto_blake2b(digest + out, digest + in, 64); - i += 1; - in += 32; - out += 32; - } - crypto_blake2b_general(digest + out, digest_size - (32 * r), - 0, 0, // no key - digest + in , 64); - } -} - -// Core of the compression function G. Computes Z from R in place. -static void -g_rounds(block *work_block) -{ -#define LSB(x) ((x) & 0xffffffff) -#define G(a, b, c, d) \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \ - a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \ - c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63) -#define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \ - v8, v9, v10, v11, v12, v13, v14, v15) \ - G(v0, v4, v8, v12); G(v1, v5, v9, v13); \ - G(v2, v6, v10, v14); G(v3, v7, v11, v15); \ - G(v0, v5, v10, v15); G(v1, v6, v11, v12); \ - G(v2, v7, v8, v13); G(v3, v4, v9, v14) - - // column rounds (work_block = Q) - for (int i = 0; i < 128; i += 16) { - ROUND(work_block->a[i ], work_block->a[i + 1], - work_block->a[i + 2], work_block->a[i + 3], - work_block->a[i + 4], work_block->a[i + 5], - work_block->a[i + 6], work_block->a[i + 7], - work_block->a[i + 8], work_block->a[i + 9], - work_block->a[i + 10], work_block->a[i + 11], - work_block->a[i + 12], work_block->a[i + 13], - work_block->a[i + 14], work_block->a[i + 15]); - } - // row 
rounds (work_block = Z) - for (int i = 0; i < 16; i += 2) { - ROUND(work_block->a[i ], work_block->a[i + 1], - work_block->a[i + 16], work_block->a[i + 17], - work_block->a[i + 32], work_block->a[i + 33], - work_block->a[i + 48], work_block->a[i + 49], - work_block->a[i + 64], work_block->a[i + 65], - work_block->a[i + 80], work_block->a[i + 81], - work_block->a[i + 96], work_block->a[i + 97], - work_block->a[i + 112], work_block->a[i + 113]); - } -} - -// The compression function G -// may overwrite result completely (xcopy == copy_block), -// or XOR result with the old block (xcopy == xor_block) -static void -binary_g(block *result, const block *x, const block *y, - void (*xcopy) (block*, const block*)) -{ - // put R = X ^ Y into tmp - block tmp; - copy_block(&tmp, x); - xor_block (&tmp, y); - - xcopy(result, &tmp); // save R (erase or xor the old block) - g_rounds(&tmp); // tmp = Z - xor_block(result, &tmp); // result = R ^ Z (or R ^ Z ^ old) -} - -// unary version of the compression function. -// The missing argument is implied zero. -// Does the transformation in place. 
-static void -unary_g(block *work_block) -{ - // work_block == R - block tmp; - copy_block(&tmp, work_block); // tmp = R - g_rounds(work_block); // work_block = Z - xor_block(work_block, &tmp); // work_block = Z ^ R -} - -typedef struct gidx_ctx { - block b; - uint32_t pass_number; - uint32_t slice_number; - uint32_t nb_blocks; - uint32_t nb_iterations; - uint32_t ctr; - uint32_t index; -} gidx_ctx; - -static void -gidx_refresh(gidx_ctx *ctx) -{ - ctx->b.a[0] = ctx->pass_number; - ctx->b.a[1] = 0; // lane number (we have only one) - ctx->b.a[2] = ctx->slice_number; - ctx->b.a[3] = ctx->nb_blocks; - ctx->b.a[4] = ctx->nb_iterations; - ctx->b.a[5] = 1; // type: Argon2i - ctx->b.a[6] = ctx->ctr; - // zero the rest of the block - for (int i = 7; i < 128; i++) { - ctx->b.a[i] = 0; - } - - // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero) - // Applies the G "square" function to get cheap pseudo-random numbers. - unary_g(&(ctx->b)); - unary_g(&(ctx->b)); // square means apply it twice -} - -static void -gidx_init(gidx_ctx *ctx, - uint32_t pass_number, - uint32_t slice_number, - uint32_t nb_blocks, - uint32_t nb_iterations) -{ - ctx->pass_number = pass_number; - ctx->slice_number = slice_number; - ctx->nb_blocks = nb_blocks; - ctx->nb_iterations = nb_iterations; - ctx->ctr = 1; // not zero, surprisingly - ctx->index = pass_number == 0 && slice_number == 0 ? 2 : 0; - gidx_refresh(ctx); -} - -static uint32_t -gidx_next(gidx_ctx *ctx) -{ - // lazily creates the index block we need - if (ctx->index == 128) { - ctx->index = 0; - ctx->ctr++; - gidx_refresh(ctx); - } - // saves and increment the index - uint32_t index = ctx->index; - ctx->index++; // updates index for the next call - - // Computes the area size. - // Pass 0 : all already finished segments plus already constructed - // blocks in this segment - // Pass 1+: 3 last segments plus already constructed - // blocks in this segment THE SPEC SUGGESTS OTHERWISE. - // I CONFORM TO THE REFERENCE IMPLEMENTATION. 
- _Bool first_pass = ctx->pass_number == 0; - uint32_t slice_size = ctx->nb_blocks / 4; - uint32_t area_size = ((first_pass ? ctx->slice_number : 3) - * slice_size + index - 1); - - // Computes the starting position of the reference area. - // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE - // NEXT SEGMENT, NOT THE NEXT BLOCK. - uint32_t next_slice = (ctx->slice_number == 3 - ? 0 - : (ctx->slice_number + 1) * slice_size); - uint32_t start_pos = first_pass ? 0 : next_slice; - - // Generates the actual index from J1 (no need for J2, there's only one lane) - uint64_t j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number - uint64_t x = (j1 * j1) >> 32; - uint64_t y = (area_size * x) >> 32; - uint64_t z = area_size - 1 - y; - return (start_pos + z) % ctx->nb_blocks; -} - -// Main algorithm -void -crypto_argon2i_hash(uint8_t *tag, uint32_t tag_size, - const uint8_t *password, uint32_t password_size, - const uint8_t *salt, uint32_t salt_size, - const uint8_t *key, uint32_t key_size, - const uint8_t *ad, uint32_t ad_size, - void *work_area, - uint32_t nb_blocks, - uint32_t nb_iterations) -{ - // work area seen as blocks (must be suitably aligned) - block *blocks = work_area; - { - crypto_blake2b_ctx ctx; - crypto_blake2b_init(&ctx); - - blake_update_32 (&ctx, 1 ); // p: number of threads - blake_update_32 (&ctx, tag_size ); - blake_update_32 (&ctx, nb_blocks ); - blake_update_32 (&ctx, nb_iterations); - blake_update_32 (&ctx, 0x13 ); // v: version number - blake_update_32 (&ctx, 1 ); // y: Argon2i - blake_update_32 (&ctx, password_size); - crypto_blake2b_update(&ctx, password, password_size); - blake_update_32 (&ctx, salt_size); - crypto_blake2b_update(&ctx, salt, salt_size); - blake_update_32 (&ctx, key_size); - crypto_blake2b_update(&ctx, key, key_size); - blake_update_32 (&ctx, ad_size); - crypto_blake2b_update(&ctx, ad, ad_size); - - uint8_t initial_hash[72]; // 64 bytes plus 2 words for future hashes - crypto_blake2b_final(&ctx, initial_hash); - - // fill 
first 2 blocks - block tmp_block; - uint8_t hash_area[1024]; - store32_le(initial_hash + 64, 0); // first additional word - store32_le(initial_hash + 68, 0); // second additional word - extended_hash(hash_area, 1024, initial_hash, 72); - load_block(&tmp_block, hash_area); - copy_block(blocks, &tmp_block); - - store32_le(initial_hash + 64, 1); // slight modification - extended_hash(hash_area, 1024, initial_hash, 72); - load_block(&tmp_block, hash_area); - copy_block(blocks + 1, &tmp_block); - } - - // Actual number of blocks - nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread) - const uint32_t segment_size = nb_blocks / 4; - - // fill (then re-fill) the rest of the blocks - for (uint32_t pass_number = 0; pass_number < nb_iterations; pass_number++) { - _Bool first_pass = pass_number == 0; - // Simple copy on pass 0, XOR instead of overwrite on subsequent passes - void (*xcopy) (block*, const block*) = first_pass ?copy_block :xor_block; - - for (int segment = 0; segment < 4; segment++ ) { - - gidx_ctx ctx; - gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations); - - // On the first segment of the first pass, - // blocks 0 and 1 are already filled. - // We use the offset to skip them. - uint32_t offset = first_pass && segment == 0 ? 2 : 0; - // current, reference, and previous are block indices - for (uint32_t current = segment * segment_size + offset; - current < (segment + 1) * segment_size; - current++) { - uint32_t previous = current == 0 ? 
nb_blocks - 1 : current - 1; - uint32_t reference = gidx_next(&ctx); - binary_g(blocks + current, - blocks + previous, - blocks + reference, - xcopy); - } - } - } - // hash the very last block with H' into the output tag - uint8_t final_block[1024]; - store_block(final_block, blocks + (nb_blocks - 1)); - extended_hash(tag, tag_size, final_block, 1024); -} - -void -crypto_argon2i(uint8_t tag[32], - const uint8_t *password, uint32_t password_size, - const uint8_t *salt, uint32_t salt_size, - void *work_area, - uint32_t nb_blocks, - uint32_t nb_iterations) -{ - crypto_argon2i_hash(tag , 32, - password, password_size, - salt , salt_size, - 0, 0, 0, 0, - work_area, nb_blocks, nb_iterations); -} diff --git a/argon2i.h b/argon2i.h deleted file mode 100644 index b7babf2..0000000 --- a/argon2i.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef ARGON2I_H -#define ARGON2I_H - -#include -#include - -// Implements argon2i, with degree of paralelism 1, -// because it's good enough, and threads are scary. -// -// key and ad are optionnal. They can be NULL if their respective size is 0. -// work_area is a pointer to a contiguous chunk of memory of at least -// nb_blocks * 1024 bytes. It must be suitably aligned for 64-bit words. -// Don't worry too much about alignment, malloc()'s results work. -// -// Choice of parameters for password hashing: -// - If you need a key, use a 32 bytes one. -// - Do what you will with the ad. -// - Use a 32 bytes tag (to get a 256-bit key) -// - Put 128 bits of entropy in the salt. 16 random bytes work well. -// - Use all the memory you can get away with. -// - Use as much iterations as reasonable. No less than 10 passes if you can. 
-void -crypto_argon2i_hash(uint8_t *tag, uint32_t tag_size, // >= 4 - const uint8_t *password, uint32_t password_size, - const uint8_t *salt, uint32_t salt_size, // >= 8 - const uint8_t *key, uint32_t key_size, - const uint8_t *ad, uint32_t ad_size, - void *work_area, - uint32_t nb_blocks, // >= 8 - uint32_t nb_iterations); - -// Convenience function. No key, no ad, 64 bytes tag -void -crypto_argon2i(uint8_t tag[32], - const uint8_t *password, uint32_t password_size, - const uint8_t *salt, uint32_t salt_size, // >= 8 - void *work_area, - uint32_t nb_blocks, // >= 8 - uint32_t nb_iterations); - - -#endif // ARGON2I_H diff --git a/blake2b.c b/blake2b.c deleted file mode 100644 index f9fb269..0000000 --- a/blake2b.c +++ /dev/null @@ -1,175 +0,0 @@ -// ripped off from the reference implentation in RFC 7693 - -#include "blake2b.h" - -// Cyclic right rotation. -static uint64_t -rotr64(uint64_t x, uint64_t y) -{ - return (x >> y) ^ (x << (64 - y)); -} - -static uint64_t -load64_le(uint8_t *s) -{ - // portable, slow way - return - ((uint64_t)s[0] ) ^ - ((uint64_t)s[1] << 8) ^ - ((uint64_t)s[2] << 16) ^ - ((uint64_t)s[3] << 24) ^ - ((uint64_t)s[4] << 32) ^ - ((uint64_t)s[5] << 40) ^ - ((uint64_t)s[6] << 48) ^ - ((uint64_t)s[7] << 56); -} - -// Initialization Vector. -static const uint64_t blake2b_iv[8] = { - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 -}; - -// increment a 128-bit "word". 
-static void -incr(uint64_t x[2], uint64_t y) -{ - x[0] += y; // increment the low word - if (x[0] < y) { x[1]++; } // handle overflow -} - -static void -blake2b_compress(crypto_blake2b_ctx *ctx, _Bool last_block) -{ - static const uint8_t sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } - }; - - // init work variables (before shuffling them) - uint64_t v[16]; - for (int i = 0; i < 8; i++) { - v[i ] = ctx->hash[i]; - v[i + 8] = blake2b_iv[i]; - } - v[12] ^= ctx->input_size[0]; // low 64 bits of offset - v[13] ^= ctx->input_size[1]; // high 64 bits - if (last_block) { v[14] = ~v[14]; } - - // load the input buffer - uint64_t m[16]; - for (int i = 0; i < 16; i++) { - m[i] = load64_le(&ctx->buf[i * 8]); - } - - // shuffle the work variables with the 12 rounds - for (int i = 0; i < 12; i++) { -#define B2B_G(a, b, c, d, x, y) \ - v[a] += v[b] + x; v[d] ^= v[a]; v[d] = rotr64(v[d], 32); \ - v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 24); \ - v[a] += v[b] + y; v[d] ^= v[a]; v[d] = rotr64(v[d], 16); \ - v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 63) - - B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); - B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); - B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); - B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); - B2B_G( 
0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); - B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); - B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); - B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); - } - - // accumulate the work variables into the hash - for(int i = 0; i < 8; i++) { - ctx->hash[i] ^= v[i] ^ v[i + 8]; - } -} - -void -crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen, - const uint8_t *key, size_t keylen) -{ - // Initial hash == initialization vector... - for (int i = 0; i < 8; i++) { - ctx->hash[i] = blake2b_iv[i]; - } - ctx->hash[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; // ...mostly - ctx->input_size[0] = 0; // input count low word - ctx->input_size[1] = 0; // input count high word - ctx->c = 0; // pointer within buffer - ctx->output_size = outlen; // size of the final hash - - // If there's a key, put it in the first block, then pad with zeroes - if (keylen > 0) { - for (size_t i = 0 ; i < keylen; i++) { ctx->buf[i] = key[i]; } - for (size_t i = keylen; i < 128 ; i++) { ctx->buf[i] = 0; } - ctx->c = 128; // mark the block as used - } -} - -void -crypto_blake2b_init(crypto_blake2b_ctx *ctx) -{ - crypto_blake2b_general_init(ctx, 64, 0, 0); -} - -void -crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen) -{ - for (size_t i = 0; i < inlen; i++) { - // If the buffer is full, increment the counters and - // add (compress) the current buffer to the hash - if (ctx->c == 128) { - ctx->c = 0; - incr(ctx->input_size, 128); - blake2b_compress(ctx, 0); // not last time -> 0 - } - // By now the buffer is not full. We add one input byte. 
- ctx->buf[ctx->c] = in[i]; - ctx->c++; - } -} - -void -crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out) -{ - // update input size, pad then compress the buffer - incr(ctx->input_size, ctx->c); - for (int i = ctx->c; i < 128; i++) { ctx->buf[i] = 0; } - blake2b_compress(ctx, 1); // last time -> 1 - - // copy the hash in the output (little endian of course) - for (int i = 0; i < ctx->output_size; i++) { - out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xFF; - } -} - -void -crypto_blake2b_general( uint8_t*out, size_t outlen, - const uint8_t*key, size_t keylen, - const uint8_t*in, size_t inlen) -{ - crypto_blake2b_ctx ctx; - crypto_blake2b_general_init(&ctx, outlen, key, keylen); - crypto_blake2b_update(&ctx, in, inlen); - crypto_blake2b_final(&ctx, out); -} - -void -crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen) -{ - crypto_blake2b_general(out, 64, 0, 0, in, inlen); -} diff --git a/blake2b.h b/blake2b.h deleted file mode 100644 index dd3338b..0000000 --- a/blake2b.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef BLAKE2B_H -#define BLAKE2B_H - -#include -#include - -// blake2b context -typedef struct { - uint8_t buf[128]; // input buffer - uint64_t hash[8]; // chained state - uint64_t input_size[2]; // total number of bytes - uint8_t c; // pointer for buf[] - uint8_t output_size; // digest size -} crypto_blake2b_ctx; - -// Initializes the context with user defined parameters: -// outlen: the length of the hash. Must be between 1 and 64. -// keylen: length of the key. Must be between 0 and 64. -// key : some secret key. May be NULL if keylen is 0. -// Any deviation from these invariants results in UNDEFINED BEHAVIOR -void -crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen, - const uint8_t *key, size_t keylen); - -// Convenience function: 64 bytes hash, no secret key. -void -crypto_blake2b_init(crypto_blake2b_ctx *ctx); - -// Add "inlen" bytes from "in" into the hash. 
-void -crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen); - -// Generate the message digest (size given in init). -void -crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out); - -// All-in-one convenience function. -// outlen, keylen, and key work the same as they do in the general_init function -void -crypto_blake2b_general( uint8_t *out, size_t outlen, // digest - const uint8_t *key, size_t keylen, // optional secret key - const uint8_t *in , size_t inlen); // data to be hashed - -// All-in-one convenience function: 64 bytes hash, no secret key. -void -crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen); - - - -#endif // BLAKE2B_H diff --git a/build.sh b/build.sh index affbfd4..256c001 100755 --- a/build.sh +++ b/build.sh @@ -3,15 +3,7 @@ CC="gcc" CFLAGS="-O2 -Wall -Wextra -std=c11" -$CC $CFLAGS -c chacha20.c -$CC $CFLAGS -c blake2b.c -$CC $CFLAGS -c poly1305.c -$CC $CFLAGS -c argon2i.c -$CC $CFLAGS -c ae.c -$CC $CFLAGS -c x25519.c -$CC $CFLAGS -c ed25519.c -DED25519_SHA512 -$CC $CFLAGS -c lock.c +$CC $CFLAGS -c monocypher.c -DED25519_SHA512 $CC $CFLAGS -c sha512.c $CC $CFLAGS -c test.c - -$CC $CFLAGS -o test test.o chacha20.o argon2i.o blake2b.o poly1305.o x25519.o ae.o lock.o sha512.o ed25519.o +$CC $CFLAGS -o test test.o monocypher.o sha512.o diff --git a/chacha20.c b/chacha20.c deleted file mode 100644 index 023f114..0000000 --- a/chacha20.c +++ /dev/null @@ -1,139 +0,0 @@ -#include "chacha20.h" - -static uint32_t -load32_le(const uint8_t s[4]) -{ - // Portable, slow way. - // Only affects initialisation, though. - return s[0] - | (s[1] << 8) - | (s[2] << 16) - | (s[3] << 24); -} - -static void -store32_le(uint8_t output[4], uint32_t input) -{ - // Portable, slow way. 
- output[0] = input & 0xff; - output[1] = (input >> 8) & 0xff; - output[2] = (input >> 16) & 0xff; - output[3] = (input >> 24) & 0xff; -} - -static void -chacha20_rounds(uint32_t out[16], const uint32_t in[16]) -{ - for (int i = 0; i < 16; i++) - out[i] = in[i]; - - for (int i = 0; i < 10; i++) { // 20 rounds, 2 rounds per loop. -#define ROT_L32(x, n) x = (x << n) | (x >> (32 - n)) -#define QUARTERROUND(a, b, c, d) \ - a += b; d ^= a; ROT_L32(d, 16); \ - c += d; b ^= c; ROT_L32(b, 12); \ - a += b; d ^= a; ROT_L32(d, 8); \ - c += d; b ^= c; ROT_L32(b, 7) - - QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0 - QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1 - QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2 - QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3 - QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1 - QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2 - QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3 - QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4 - } -} - -static void -chacha20_init_key(crypto_chacha_ctx *ctx, const uint8_t key[32]) -{ - // constant - ctx->input[0] = load32_le((uint8_t*)"expa"); - ctx->input[1] = load32_le((uint8_t*)"nd 3"); - ctx->input[2] = load32_le((uint8_t*)"2-by"); - ctx->input[3] = load32_le((uint8_t*)"te k"); - // key - for (int i = 0; i < 8; i++) - ctx->input[i + 4] = load32_le(key + i*4); - // pool index (the random pool starts empty) - ctx->pool_index = 64; -} - -void -crypto_chacha20_H(uint8_t out[32], - const uint8_t key[32], - const uint8_t in [16]) -{ - crypto_chacha_ctx ctx; - chacha20_init_key(&ctx, key); - for (int i = 0; i < 4; i++) - ctx.input[i + 12] = load32_le(in + i*4); - - uint32_t buffer[16]; - chacha20_rounds(buffer, ctx.input); - // prevents reversal of the rounds by revealing only half of the buffer. 
- for (int i = 0; i < 4; i++) { - store32_le(out + i*4, buffer[i ]); // constant - store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce - } -} - -void -crypto_chacha20_init(crypto_chacha_ctx *ctx, - const uint8_t key[32], - const uint8_t nonce[8]) -{ - chacha20_init_key(ctx, key ); // key - ctx->input[12] = 0; // counter - ctx->input[13] = 0; // counter - ctx->input[14] = load32_le(nonce + 0); // nonce - ctx->input[15] = load32_le(nonce + 4); // nonce -} - -void -crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, - const uint8_t key[32], - const uint8_t nonce[24]) -{ - uint8_t derived_key[32]; - crypto_chacha20_H(derived_key, key, nonce); - crypto_chacha20_init(ctx, derived_key, nonce + 16); -} - -void -crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, - const uint8_t *plain_text, - uint8_t *cipher_text, - size_t message_size) -{ - for (size_t i = 0; i < message_size; i++) { - // refill the pool if empty - if (ctx->pool_index == 64) { - // fill the pool - uint32_t buffer[16]; - chacha20_rounds(buffer, ctx->input); - for (int i = 0; i < 16; i++) - store32_le(ctx->random_pool + i*4, buffer[i] + ctx->input[i]); - // update the counters - ctx->pool_index = 0; - ctx->input[12]++; - if (!ctx->input[12]) - ctx->input[13]++; - } - // use the pool for encryption (or random stream) - cipher_text[i] = - (plain_text == 0 ? 0 : plain_text[i]) - ^ ctx->random_pool[ctx->pool_index]; - ctx->pool_index++; - } -} - -void -crypto_chacha20_random(crypto_chacha_ctx *ctx, - uint8_t *cipher_text, - size_t message_size) -{ - crypto_chacha20_encrypt(ctx, 0, cipher_text, message_size); -} diff --git a/chacha20.h b/chacha20.h deleted file mode 100644 index 9f63614..0000000 --- a/chacha20.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef CHACHA20_H -#define CHACHA20_H - -#include -#include - -// This is a chacha20 context. 
-// To use safely, just follow these guidelines: -// - Always initialize your context with one of the crypto_init_* functions below -// - Dont't modify it, except through the crypto_chacha20_* below. -// - Never duplicate it. -typedef struct crypto_chacha_ctx { - uint32_t input[16]; // current input, unencrypted - uint8_t random_pool[64]; // last input, encrypted - uint8_t pool_index; // pointer to random_pool -} crypto_chacha_ctx; - -// HChacha20. *Kind* of a cryptographic hash, based on the chacha20 rounds. -// Used for XChacha20, and the key derivation of the X25519 shared secret. -// Don't use it unless you really know what you're doing. -void -crypto_chacha20_H(uint8_t out[32], - const uint8_t key[32], - const uint8_t in [16]); - -// Initializes a chacha context. -// -// WARNING: DON'T USE THE SAME NONCE AND KEY TWICE -// -// You'd be exposing the XOR of subsequent encrypted -// messages, thus destroying your confidentiality. -// -// WARNING: DON'T SELECT THE NONCE AT RANDOM -// -// If you encode enough messages with a random nonce, there's a good -// chance some of them will use the same nonce by accident. 64 bits -// just isn't enough for this. Use a counter instead. -// -// If there are multiple parties sending out messages, you can give them -// all an initial nonce of 0, 1 .. n-1 respectively, and have them increment -// their nonce by n. (Also make sure the nonces never wrap around.). -void -crypto_chacha20_init(crypto_chacha_ctx *ctx, - const uint8_t key[32], - const uint8_t nonce[8]); - -// Initializes a chacha context, with a big nonce (192 bits), -// more than enough to be selected at random. -// -// The price you pay for that is a slower initialization. The security -// guarantees are the same as regular initialization. -void -crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, - const uint8_t key[32], - const uint8_t nonce[24]); - -// Encrypts the plain_text by XORing it with a pseudo-random -// stream of numbers, seeded by the provided chacha20 context. 
-// Decryption uses the exact same method. -// -// Once the context is initialized, encryptions can safely be chained thus: -// -// crypto_encrypt_chacha20(ctx, plain_0, cipher_0, length_0); -// crypto_encrypt_chacha20(ctx, plain_1, cipher_1, length_1); -// crypto_encrypt_chacha20(ctx, plain_2, cipher_2, length_2); -// -// plain_text and cipher_text may point to the same location, for in-place -// encryption. -// -// plain_text is allowed to be null (0), in which case it will be -// interpreted as an all zero input. The cipher_text will then -// contain the raw chacha20 stream. Useful as a random number -// generator. -// -// WARNING: ENCRYPTION ALONE IS NOT SECURE. YOU NEED AUTHENTICATION AS WELL. -// Use the provided authenticated encryption constructions. -void -crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, - const uint8_t *plain_text, - uint8_t *cipher_text, - size_t message_size); - -// convenience function. Same as chacha20_encrypt() with a null plain_text. -void -crypto_chacha20_random(crypto_chacha_ctx *ctx, - uint8_t *cipher_text, - size_t message_size); - -#endif // CHACHA20_H diff --git a/ed25519.c b/ed25519.c deleted file mode 100644 index a1baa65..0000000 --- a/ed25519.c +++ /dev/null @@ -1,391 +0,0 @@ -// Taken from TweetNaCl. 
-// I tried the ref10 implementation, but that was too damn big - -#include "ed25519.h" - -#define FOR(i, start, end) for (size_t i = start; i < end; i++) -#define sv static void -#define sc static const - -typedef uint8_t u8; -typedef int64_t i64; -typedef uint64_t u64; -typedef i64 gf[16]; - -sc gf gf0; -sc gf gf1 = { 1 }; -sc gf D = { 0x78a3, 0x1359, 0x4dca, 0x75eb, 0xd8ab, 0x4141, 0x0a4d, 0x0070, - 0xe898, 0x7779, 0x4079, 0x8cc7, 0xfe73, 0x2b6f, 0x6cee, 0x5203}; -sc gf D2 = { 0xf159, 0x26b2, 0x9b94, 0xebd6, 0xb156, 0x8283, 0x149a, 0x00e0, - 0xd130, 0xeef3, 0x80f2, 0x198e, 0xfce7, 0x56df, 0xd9dc, 0x2406}; -sc gf X = { 0xd51a, 0x8f25, 0x2d60, 0xc956, 0xa7b2, 0x9525, 0xc760, 0x692c, - 0xdc5c, 0xfdd6, 0xe231, 0xc0a4, 0x53fe, 0xcd6e, 0x36d3, 0x2169}; -sc gf Y = { 0x6658, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, - 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666}; -sc gf I = { 0xa0b0, 0x4a0e, 0x1b27, 0xc4ee, 0xe478, 0xad2f, 0x1806, 0x2f43, - 0xd7a7, 0x3dfb, 0x0099, 0x2b4d, 0xdf0b, 0x4fc1, 0x2480, 0x2b83}; - -sc u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, - 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; - -sv car_25519(gf o) -{ - FOR(i, 0, 16) { - o[i] += 1LL << 16; - i64 c = o[i] >> 16; - o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15)); - o[i] -= c << 16; - } -} - -sv sel_25519(gf p, gf q, int b) -{ - i64 c = ~(b-1); - FOR(i, 0, 16) { - i64 t = c & (p[i] ^ q[i]); - p[i] ^= t; - q[i] ^= t; - } -} - -sv pack_25519(u8 *o, const gf n) -{ - gf t; - FOR(i, 0, 16) t[i] = n[i]; - car_25519(t); - car_25519(t); - car_25519(t); - FOR(j, 0, 2) { - gf m; - m[0] = t[0] - 0xffed; - FOR(i, 1, 15) { - m[i ] = t[i] - 0xffff - ((m[i-1] >> 16) & 1); - m[i-1] &= 0xffff; - } - m[15] = t[15] - 0x7fff - ((m[14] >> 16) & 1); - int b = (m[15] >> 16) & 1; - m[14] &= 0xffff; - sel_25519(t, m, 1-b); - } - FOR(i, 0, 16) { - o[2*i ] = t[i] & 
0xff; - o[2*i + 1] = t[i] >> 8; - } -} - -sv A(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] + b[i]; } -sv Z(gf o, const gf a, const gf b) { FOR(i, 0, 16) o[i] = a[i] - b[i]; } -sv M(gf o, const gf a, const gf b) -{ - i64 t[31]; - FOR(i, 0, 31) t[i] = 0; - FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j]; - FOR(i, 0, 15) t[i] += 38 * t[i+16]; - FOR(i, 0, 16) o[i] = t[i]; - car_25519(o); - car_25519(o); -} -sv S(gf o,const gf a){ M(o, a, a); } - -sv inv_25519(gf o,const gf i) -{ - gf c; - FOR(a, 0, 16) c[a] = i[a]; - for(int a = 253; a >= 0; a--) { - S(c, c); - if(a != 2 && a != 4) - M(c, c, i); - } - FOR(a, 0, 16) o[a] = c[a]; -} - -sv unpack_25519(gf o, const u8 *n) -{ - FOR(i, 0, 16) o[i] = n[2*i] + ((i64)n[2*i + 1] << 8); - o[15] &= 0x7fff; -} - -sv set_25519(gf r, const gf a) { FOR(i, 0, 16) r[i] = a[i]; } - -static u8 par_25519(const gf a) -{ - u8 d[32]; - pack_25519(d, a); - return d[0] & 1; -} - -sv pow2523(gf o,const gf i) -{ - gf c; - FOR(a, 0, 16) c[a] = i[a]; - for(int a = 250; a >= 0; a--) { - S(c, c); - if(a != 1) M(c, c, i); - } - FOR(a, 0, 16) o[a] = c[a]; -} - -static int vn(const u8 *x, const u8 *y, size_t n) -{ - uint32_t d = 0; - FOR(i, 0, n) d |= x[i] ^ y[i]; - return (1 & ((d - 1) >> 8)) - 1; -} - -static int neq_25519(const gf a, const gf b) -{ - u8 c[32],d[32]; - pack_25519(c, a); - pack_25519(d, b); - return vn(c, d, 32); -} - -sv add(gf p[4], gf q[4]) -{ - gf a, b, c, d, t, e, f, g, h; - Z(a, p[1], p[0]); - Z(t, q[1], q[0]); - M(a, a, t); - A(b, p[0], p[1]); - A(t, q[0], q[1]); - M(b, b, t); - M(c, p[3], q[3]); - M(c, c, D2); - M(d, p[2], q[2]); - A(d, d, d); - Z(e, b, a); - Z(f, d, c); - A(g, d, c); - A(h, b, a); - - M(p[0], e, f); - M(p[1], h, g); - M(p[2], g, f); - M(p[3], e, h); -} - -sv cswap(gf p[4], gf q[4], u8 b) -{ - FOR(i, 0, 4) - sel_25519(p[i],q[i],b); -} - -sv pack(u8 *r, gf p[4]) -{ - gf tx, ty, zi; - inv_25519(zi, p[2]); - M(tx, p[0], zi); - M(ty, p[1], zi); - pack_25519(r, ty); - r[31] ^= par_25519(tx) << 7; -} 
- -sv scalarmult(gf p[4], gf q[4], const u8 *s) -{ - set_25519(p[0], gf0); - set_25519(p[1], gf1); - set_25519(p[2], gf1); - set_25519(p[3], gf0); - for (int i = 255; i >= 0; i--) { - u8 b = (s[i/8] >> (i & 7)) & 1; - cswap(p, q, b); - add(q, p); - add(p, p); - cswap(p, q, b); - } -} - -sv scalarbase(gf p[4], const u8 *s) -{ - gf q[4]; - set_25519(q[0], X); - set_25519(q[1], Y); - set_25519(q[2], gf1); - M(q[3], X, Y); - scalarmult(p, q, s); -} - -sv modL(u8 *r, i64 x[64]) -{ - i64 i, j; - for (i = 63;i >= 32;--i) { - i64 carry = 0; - for (j = i - 32;j < i - 12;++j) { - x[j] += carry - 16 * x[i] * L[j - (i - 32)]; - carry = (x[j] + 128) >> 8; - x[j] -= carry << 8; - } - x[j] += carry; - x[i] = 0; - } - i64 carry = 0; - FOR(j, 0, 32) { - x[j] += carry - (x[31] >> 4) * L[j]; - carry = x[j] >> 8; - x[j] &= 255; - } - FOR(j, 0, 32) x[j] -= carry * L[j]; - FOR(i, 0, 32) { - x[i+1] += x[i] >> 8; - r[i ] = x[i] & 255; - } -} - -sv reduce(u8 r[64]) -{ - i64 x[64]; - FOR(i, 0, 64) x[i] = (u64) r[i]; - FOR(i, 0, 64) r[i] = 0; - modL(r, x); -} - -static int unpackneg(gf r[4],const u8 p[32]) -{ - gf t, chk, num, den, den2, den4, den6; - set_25519(r[2], gf1); - unpack_25519(r[1], p); - S(num,r [1]); - M(den, num, D); - Z(num, num, r[2]); - A(den, r[2], den); - - S(den2, den); - S(den4, den2); - M(den6, den4, den2); - M(t, den6, num); - M(t, t, den); - - pow2523(t, t); - M(t, t, num); - M(t, t, den); - M(t, t, den); - M(r[0], t, den); - - S(chk, r[0]); - M(chk, chk, den); - if (neq_25519(chk, num)) M(r[0], r[0], I); - - S(chk, r[0]); - M(chk, chk, den); - if (neq_25519(chk, num)) return -1; - - if (par_25519(r[0]) == (p[31]>>7)) Z(r[0],gf0,r[0]); - - M(r[3], r[0], r[1]); - return 0; -} - -#ifdef ED25519_BLAKE2B - #include "blake2b.h" - #define HASH crypto_blake2b -#else - #ifdef ED25519_SHA512 - #include "sha512.h" - #define HASH crypto_sha512 - #endif -#endif - -#define COMBINE1(x, y) x ## y -#define COMBINE2(x, y) COMBINE1(x, y) -#define HASH_CTX COMBINE2(HASH, _ctx) -#define 
HASH_INIT COMBINE2(HASH, _init) -#define HASH_UPDATE COMBINE2(HASH, _update) -#define HASH_FINAL COMBINE2(HASH, _final) - -// hash function interface -// Typical uses: sha512 for tests vectors, blake2b for production. -void HASH_INIT (HASH_CTX *ctx); -void HASH_UPDATE(HASH_CTX *ctx, const u8 *in, size_t inlen); -void HASH_FINAL (HASH_CTX *ctx, u8 hash[64]); -void HASH(u8 hash[64], const u8 *in, size_t inlen); - -sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) -{ - HASH_CTX ctx; - HASH_INIT (&ctx); - HASH_UPDATE(&ctx, R , 32 ); - HASH_UPDATE(&ctx, A , 32 ); - HASH_UPDATE(&ctx, M , M_size); - HASH_FINAL (&ctx, k); - reduce(k); -} - -void crypto_ed25519_public_key(uint8_t public_key[32], - const uint8_t secret_key[32]) -{ - // hash the private key, turn the hash into a scalar - u8 a[64]; - HASH(a, secret_key, 32); - a[ 0] &= 248; - a[31] &= 127; - a[31] |= 64; - - // the public key is the packed form of the point aB (B == basepoint) - gf aB[4]; - scalarbase(aB, a); - pack(public_key, aB); -} - -void crypto_ed25519_sign(uint8_t signature[64], - const uint8_t secret_key[32], - const uint8_t *message, - size_t message_size) -{ - u8 h[64]; - u8 *a = h; // secret scalar - u8 *prefix = h + 32; // prefix for nonce generation - HASH(h, secret_key, 32); - - // build public key from secret key - a[ 0] &= 248; - a[31] &= 127; - a[31] |= 64; - gf aB[4]; - scalarbase(aB, a); - u8 public_key[32]; - pack(public_key, aB); - - // Constructs the "random" nonce from the secret key and message. - // An actual random number would work just fine, and would save us - // the trouble of hashing the message twice. If we did that - // however, the user could fuck it up and reuse the nonce. 
- u8 r[64]; - HASH_CTX ctx; - HASH_INIT (&ctx); - HASH_UPDATE(&ctx, prefix , 32 ); - HASH_UPDATE(&ctx, message, message_size); - HASH_FINAL (&ctx, r); - - gf rB[4]; - reduce(r); - scalarbase(rB, r); - pack(signature, rB); // first half of the signature = "random" nonce - - u8 k[64]; - hash_k(k, signature, public_key, message, message_size); - - i64 s[64]; // s = r + k a - FOR(i, 0, 32) s[i] = (u64) r[i]; - FOR(i, 32, 64) s[i] = 0; - FOR(i, 0, 32) { - FOR(j, 0, 32) { - s[i+j] += k[i] * (u64) a[j]; - } - } - modL(signature + 32, s); // second half of the signature = s -} - -int crypto_ed25519_check(const uint8_t signature[64], - const uint8_t public_key[32], - const uint8_t *message, - size_t message_size) -{ - gf aB[4]; if (unpackneg(aB, public_key)) return -1; // -aB - u8 k[64]; hash_k(k, signature, public_key, message, message_size); - gf p[4]; scalarmult(p, aB, k); // p = -aB k - gf sB[4]; scalarbase(sB, signature + 32); add(p, sB); // p = s - aB k - u8 t[32]; pack(t, p); - return vn(signature, t, 32); // R == s - aB k ? 
OK : fail -} diff --git a/ed25519.h b/ed25519.h deleted file mode 100644 index f03764b..0000000 --- a/ed25519.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef ED25519_H -#define ED25519_H - -#include -#include - -void crypto_ed25519_public_key(uint8_t public_key[32], - const uint8_t secret_key[32]); - -void crypto_ed25519_sign(uint8_t signature[64], - const uint8_t secret_key[32], - const uint8_t *message, - size_t message_size); - -int crypto_ed25519_check(const uint8_t signature[64], - const uint8_t public_key[32], - const uint8_t *message, - size_t message_size); - -#endif // ED25519_H diff --git a/lock.c b/lock.c deleted file mode 100644 index bca01da..0000000 --- a/lock.c +++ /dev/null @@ -1,92 +0,0 @@ -#include "lock.h" -#include "x25519.h" -#include "chacha20.h" -#include "ae.h" - -void crypto_lock_key(uint8_t shared_key[32], - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32]) -{ - static const uint8_t _0[16]; - uint8_t shared_secret[32]; - crypto_x25519(shared_secret, your_secret_key, their_public_key); - crypto_chacha20_H(shared_key, shared_secret, _0); -} - -void crypto_lock_detached(uint8_t mac[16], - uint8_t *ciphertext, - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size) -{ - uint8_t shared_key[32]; - crypto_lock_key(shared_key, your_secret_key, their_public_key); - crypto_ae_lock_detached(mac, ciphertext, - shared_key, nonce, - plaintext, text_size); -} - -int crypto_unlock_detached(uint8_t *plaintext, - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32], - const uint8_t nonce[24], - const uint8_t mac[16], - const uint8_t *ciphertext, - size_t text_size) -{ - uint8_t shared_key[32]; - crypto_lock_key(shared_key, your_secret_key, their_public_key); - return crypto_ae_unlock_detached(plaintext, - shared_key, nonce, - mac, ciphertext, text_size); -} - -void crypto_lock(uint8_t *box, - const uint8_t your_secret_key 
[32], - const uint8_t their_public_key[32], - const uint8_t nonce[24], - const uint8_t *plaintext, - size_t text_size) -{ - crypto_lock_detached(box, box + 16, - your_secret_key, their_public_key, nonce, - plaintext, text_size); -} - -int crypto_unlock(uint8_t *plaintext, - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32], - const uint8_t nonce[24], - const uint8_t *box, - size_t text_size) -{ - return crypto_unlock_detached(plaintext, - your_secret_key, their_public_key, nonce, - box, box + 16, text_size); -} - -static const uint8_t null_nonce[24] = {}; - -void crypto_anonymous_lock(uint8_t *box, - const uint8_t random_secret_key[32], - const uint8_t their_public_key[32], - const uint8_t *plaintext, - size_t text_size) -{ - crypto_x25519_base(box, random_secret_key); // put public key in box - crypto_lock(box + 32, - random_secret_key, their_public_key, null_nonce, - plaintext, text_size); -} - -int crypto_anonymous_unlock(uint8_t *plaintext, - const uint8_t your_secret_key[32], - const uint8_t *box, - size_t text_size) -{ - return crypto_unlock(plaintext, - your_secret_key, box, null_nonce, - box + 32, text_size); -} diff --git a/lock.h b/lock.h deleted file mode 100644 index c1b3e87..0000000 --- a/lock.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef LOCK_H -#define LOCK_H - -#include -#include - -// Computes a shared key with your secret key and their public key, -// suitable for crypto_ae* functions. -void crypto_lock_key(uint8_t shared_key [32], - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32]); - -// Authenticated encryption with the sender's secret key and the recipient's -// public key. The message leaks if one of the secret key gets compromised. 
// Little-endian serialization helpers and a 64-bit rotation.
//
// The casts to uint32_t below are load-bearing: without them, s[i]
// (a uint8_t) is promoted to *signed* int, and shifting a byte
// >= 0x80 left by 24 overflows int — undefined behavior (C11 6.5.7p4).

// Reads a 32-bit word from s, least significant byte first.
static uint32_t load32_le(const uint8_t s[4])
{
    return  (uint32_t)s[0]
        | ((uint32_t)s[1] <<  8)
        | ((uint32_t)s[2] << 16)
        | ((uint32_t)s[3] << 24);
}

// Writes input to output, least significant byte first.
static void store32_le(uint8_t output[4], uint32_t input)
{
    output[0] =  input        & 0xff;
    output[1] = (input >>  8) & 0xff;
    output[2] = (input >> 16) & 0xff;
    output[3] = (input >> 24) & 0xff;
}

// Rotates x right by y bits.  y must be in 1..63: a rotation count of
// 0 (or 64) would shift by the full word width, which is undefined.
// Every call site in this file uses a constant in that range.
static uint64_t rotr64(uint64_t x, uint64_t y)
{
    return (x >> y) ^ (x << (64 - y));
}
uint64_t load64_le(const uint8_t s[8]) +{ + return + ((uint64_t)s[0] ) ^ + ((uint64_t)s[1] << 8) ^ + ((uint64_t)s[2] << 16) ^ + ((uint64_t)s[3] << 24) ^ + ((uint64_t)s[4] << 32) ^ + ((uint64_t)s[5] << 40) ^ + ((uint64_t)s[6] << 48) ^ + ((uint64_t)s[7] << 56); +} + +static void store64_le(uint8_t output[8], uint64_t input) +{ + output[0] = input & 0xff; + output[1] = (input >> 8) & 0xff; + output[2] = (input >> 16) & 0xff; + output[3] = (input >> 24) & 0xff; + output[4] = (input >> 32) & 0xff; + output[5] = (input >> 40) & 0xff; + output[6] = (input >> 48) & 0xff; + output[7] = (input >> 56) & 0xff; +} + +static void +chacha20_rounds(uint32_t out[16], const uint32_t in[16]) +{ + for (int i = 0; i < 16; i++) + out[i] = in[i]; + + for (int i = 0; i < 10; i++) { // 20 rounds, 2 rounds per loop. +#define ROT_L32(x, n) x = (x << n) | (x >> (32 - n)) +#define QUARTERROUND(a, b, c, d) \ + a += b; d ^= a; ROT_L32(d, 16); \ + c += d; b ^= c; ROT_L32(b, 12); \ + a += b; d ^= a; ROT_L32(d, 8); \ + c += d; b ^= c; ROT_L32(b, 7) + + QUARTERROUND(out[0], out[4], out[ 8], out[12]); // column 0 + QUARTERROUND(out[1], out[5], out[ 9], out[13]); // column 1 + QUARTERROUND(out[2], out[6], out[10], out[14]); // column 2 + QUARTERROUND(out[3], out[7], out[11], out[15]); // column 3 + QUARTERROUND(out[0], out[5], out[10], out[15]); // diagonal 1 + QUARTERROUND(out[1], out[6], out[11], out[12]); // diagonal 2 + QUARTERROUND(out[2], out[7], out[ 8], out[13]); // diagonal 3 + QUARTERROUND(out[3], out[4], out[ 9], out[14]); // diagonal 4 + } +} + +static void +chacha20_init_key(crypto_chacha_ctx *ctx, const uint8_t key[32]) +{ + // constant + ctx->input[0] = load32_le((uint8_t*)"expa"); + ctx->input[1] = load32_le((uint8_t*)"nd 3"); + ctx->input[2] = load32_le((uint8_t*)"2-by"); + ctx->input[3] = load32_le((uint8_t*)"te k"); + // key + for (int i = 0; i < 8; i++) + ctx->input[i + 4] = load32_le(key + i*4); + // pool index (the random pool starts empty) + ctx->pool_index = 64; +} + +void 
+crypto_chacha20_H(uint8_t out[32], + const uint8_t key[32], + const uint8_t in [16]) +{ + crypto_chacha_ctx ctx; + chacha20_init_key(&ctx, key); + for (int i = 0; i < 4; i++) + ctx.input[i + 12] = load32_le(in + i*4); + + uint32_t buffer[16]; + chacha20_rounds(buffer, ctx.input); + // prevents reversal of the rounds by revealing only half of the buffer. + for (int i = 0; i < 4; i++) { + store32_le(out + i*4, buffer[i ]); // constant + store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce + } +} + +void +crypto_chacha20_init(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[8]) +{ + chacha20_init_key(ctx, key ); // key + ctx->input[12] = 0; // counter + ctx->input[13] = 0; // counter + ctx->input[14] = load32_le(nonce + 0); // nonce + ctx->input[15] = load32_le(nonce + 4); // nonce +} + +void +crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[24]) +{ + uint8_t derived_key[32]; + crypto_chacha20_H(derived_key, key, nonce); + crypto_chacha20_init(ctx, derived_key, nonce + 16); +} + +void +crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t message_size) +{ + for (size_t i = 0; i < message_size; i++) { + // refill the pool if empty + if (ctx->pool_index == 64) { + // fill the pool + uint32_t buffer[16]; + chacha20_rounds(buffer, ctx->input); + for (int i = 0; i < 16; i++) + store32_le(ctx->random_pool + i*4, buffer[i] + ctx->input[i]); + // update the counters + ctx->pool_index = 0; + ctx->input[12]++; + if (!ctx->input[12]) + ctx->input[13]++; + } + // use the pool for encryption (or random stream) + cipher_text[i] = + (plain_text == 0 ? 
0 : plain_text[i]) + ^ ctx->random_pool[ctx->pool_index]; + ctx->pool_index++; + } +} + +void +crypto_chacha20_random(crypto_chacha_ctx *ctx, + uint8_t *cipher_text, + size_t message_size) +{ + crypto_chacha20_encrypt(ctx, 0, cipher_text, message_size); +} + + + +static void poly_load(uint32_t out[4], const uint8_t in[16]) +{ + for (int i = 0; i < 4; i++) + out[i] = load32_le(in + i*4); +} + +static void poly_add(uint32_t out[5], const uint32_t a[5], const uint32_t b[5]) +{ + uint64_t carry = 0; + for (int i = 0; i < 5; i++) { + carry += (int64_t)(a[i]) + b[i]; + out[i] = carry & 0xffffffff; // lower 32 bits right there. + carry >>= 32; // retain the carry + } +} + +// h = (h + c) * r +static void poly_block(crypto_poly1305_ctx *ctx) +{ + // h + c, without carry propagation + const uint64_t h0 = ctx->h[0] + (uint64_t)ctx->c[0]; + const uint64_t h1 = ctx->h[1] + (uint64_t)ctx->c[1]; + const uint64_t h2 = ctx->h[2] + (uint64_t)ctx->c[2]; + const uint64_t h3 = ctx->h[3] + (uint64_t)ctx->c[3]; + const uint64_t h4 = ctx->h[4] + (uint64_t)ctx->c[4]; + + // Local all the things! + const uint64_t r0 = ctx->r[0]; + const uint64_t r1 = ctx->r[1]; + const uint64_t r2 = ctx->r[2]; + const uint64_t r3 = ctx->r[3]; + const uint64_t rr0 = (ctx->r[0] >> 2) * 5; // lose 2 bottom bits... 
+ const uint64_t rr1 = (ctx->r[1] >> 2) * 5; // 2 bottom bits already cleared + const uint64_t rr2 = (ctx->r[2] >> 2) * 5; // 2 bottom bits already cleared + const uint64_t rr3 = (ctx->r[3] >> 2) * 5; // 2 bottom bits already cleared + + // (h + c) * r, without carry propagation + const uint64_t x0 = h0*r0 + h1*rr3 + h2*rr2 + h3*rr1 + h4*rr0; + const uint64_t x1 = h0*r1 + h1*r0 + h2*rr3 + h3*rr2 + h4*rr1; + const uint64_t x2 = h0*r2 + h1*r1 + h2*r0 + h3*rr3 + h4*rr2; + const uint64_t x3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*rr3; + const uint64_t x4 = h4 * (r0 & 3); // ...recover those 2 bits + + // carry propagation, put ctx->h under 2^130 + const uint64_t msb = x4 + (x3 >> 32); + uint64_t u = (msb >> 2) * 5; // lose 2 bottom bits... + u += (x0 & 0xffffffff) ; ctx->h[0] = u & 0xffffffff; u >>= 32; + u += (x1 & 0xffffffff) + (x0 >> 32); ctx->h[1] = u & 0xffffffff; u >>= 32; + u += (x2 & 0xffffffff) + (x1 >> 32); ctx->h[2] = u & 0xffffffff; u >>= 32; + u += (x3 & 0xffffffff) + (x2 >> 32); ctx->h[3] = u & 0xffffffff; u >>= 32; + u += msb & 3 /* ...recover them */ ; ctx->h[4] = u; +} + +// (re-)initializes the input counter and input buffer +static void poly_clear_c(crypto_poly1305_ctx *ctx) +{ + for (int i = 0; i < 4; i++) + ctx->c[i] = 0; + ctx->c_index = 0; +} + +void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]) +{ + // initial h: zero + for (int i = 0; i < 5; i++) + ctx->h [i] = 0; + // initial r: first half of the key, minus a few bits + poly_load(ctx->r, key); + ctx->r[0] &= 0x0fffffff; // clear top 4 bits + ctx->r[1] &= 0x0ffffffc; // clear top 4 & bottom 2 bits + ctx->r[2] &= 0x0ffffffc; // clear top 4 & bottom 2 bits + ctx->r[3] &= 0x0ffffffc; // clear top 4 & bottom 2 bits + ctx->c[4] = 1; + // second half of the key, saved for later + poly_load(ctx->pad, key + 16); + ctx->pad[4] = 0; + // buffer and counter + poly_clear_c(ctx); +} + +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const uint8_t *m, size_t bytes) +{ + while 
(bytes > 0) { + if (ctx->c_index == 16) { + poly_block(ctx); + poly_clear_c(ctx); + } + // feed the input buffer + ctx->c[ctx->c_index / 4] |= *m << ((ctx->c_index % 4) * 8); + ctx->c_index++; + m++; + bytes--; + } +} + +void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]) +{ + // move the final 1 according to remaining input length + ctx->c[4] = 0; + ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8); + // one last hash update... + poly_block(ctx); + // ... this time with full modular reduction + // We only need to conditionally subtract 2^130-5, + // using bit twidling to prevent timing attacks. + static const uint32_t minus_p[5] = { 5, 0, 0, 0, 0xfffffffc }; + uint32_t h_minus_p[5]; + poly_add(h_minus_p, ctx->h, minus_p); + uint32_t negative = ~(-(h_minus_p[4] >> 31)); // 0 or -1 (2's complement) + for (int i = 0; i < 5; i++) { + ctx->h[i] ^= negative & (ctx->h[i] ^ h_minus_p[i]); + } + // Add the secret pad to the final hash before output + poly_add(ctx->h, ctx->h, ctx->pad); + for (int i = 0; i < 4; i++) + store32_le(mac + i*4, ctx->h[i]); +} + +void crypto_poly1305_auth(uint8_t mac[16], const uint8_t *m, + size_t m_size , const uint8_t key[32]) +{ + crypto_poly1305_ctx ctx; + crypto_poly1305_init (&ctx, key); + crypto_poly1305_update(&ctx, m, m_size); + crypto_poly1305_finish(&ctx, mac); +} + +int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]) +{ + unsigned diff = 0; + for (int i = 0; i < 16; i++) { + diff |= (mac1[i] ^ mac2[i]); + } + return diff; +} +// ripped off from the reference implentation in RFC 7693 + + + + +// Initialization Vector. +static const uint64_t blake2b_iv[8] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 +}; + +// increment a 128-bit "word". 
+static void +incr(uint64_t x[2], uint64_t y) +{ + x[0] += y; // increment the low word + if (x[0] < y) { x[1]++; } // handle overflow +} + +static void +blake2b_compress(crypto_blake2b_ctx *ctx, _Bool last_block) +{ + static const uint8_t sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + + // init work variables (before shuffling them) + uint64_t v[16]; + for (int i = 0; i < 8; i++) { + v[i ] = ctx->hash[i]; + v[i + 8] = blake2b_iv[i]; + } + v[12] ^= ctx->input_size[0]; // low 64 bits of offset + v[13] ^= ctx->input_size[1]; // high 64 bits + if (last_block) { v[14] = ~v[14]; } + + // load the input buffer + uint64_t m[16]; + for (int i = 0; i < 16; i++) { + m[i] = load64_le(&ctx->buf[i * 8]); + } + + // shuffle the work variables with the 12 rounds + for (int i = 0; i < 12; i++) { +#define B2B_G(a, b, c, d, x, y) \ + v[a] += v[b] + x; v[d] ^= v[a]; v[d] = rotr64(v[d], 32); \ + v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 24); \ + v[a] += v[b] + y; v[d] ^= v[a]; v[d] = rotr64(v[d], 16); \ + v[c] += v[d] ; v[b] ^= v[c]; v[b] = rotr64(v[b], 63) + + B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); + B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); + B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); + B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); + B2B_G( 
0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); + B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); + B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); + B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); + } + + // accumulate the work variables into the hash + for(int i = 0; i < 8; i++) { + ctx->hash[i] ^= v[i] ^ v[i + 8]; + } +} + +void +crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen, + const uint8_t *key, size_t keylen) +{ + // Initial hash == initialization vector... + for (int i = 0; i < 8; i++) { + ctx->hash[i] = blake2b_iv[i]; + } + ctx->hash[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; // ...mostly + ctx->input_size[0] = 0; // input count low word + ctx->input_size[1] = 0; // input count high word + ctx->c = 0; // pointer within buffer + ctx->output_size = outlen; // size of the final hash + + // If there's a key, put it in the first block, then pad with zeroes + if (keylen > 0) { + for (size_t i = 0 ; i < keylen; i++) { ctx->buf[i] = key[i]; } + for (size_t i = keylen; i < 128 ; i++) { ctx->buf[i] = 0; } + ctx->c = 128; // mark the block as used + } +} + +void +crypto_blake2b_init(crypto_blake2b_ctx *ctx) +{ + crypto_blake2b_general_init(ctx, 64, 0, 0); +} + +void +crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen) +{ + for (size_t i = 0; i < inlen; i++) { + // If the buffer is full, increment the counters and + // add (compress) the current buffer to the hash + if (ctx->c == 128) { + ctx->c = 0; + incr(ctx->input_size, 128); + blake2b_compress(ctx, 0); // not last time -> 0 + } + // By now the buffer is not full. We add one input byte. 
+ ctx->buf[ctx->c] = in[i]; + ctx->c++; + } +} + +void +crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out) +{ + // update input size, pad then compress the buffer + incr(ctx->input_size, ctx->c); + for (int i = ctx->c; i < 128; i++) { ctx->buf[i] = 0; } + blake2b_compress(ctx, 1); // last time -> 1 + + // copy the hash in the output (little endian of course) + for (int i = 0; i < ctx->output_size; i++) { + out[i] = (ctx->hash[i / 8] >> (8 * (i & 7))) & 0xFF; + } +} + +void +crypto_blake2b_general( uint8_t*out, size_t outlen, + const uint8_t*key, size_t keylen, + const uint8_t*in, size_t inlen) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_general_init(&ctx, outlen, key, keylen); + crypto_blake2b_update(&ctx, in, inlen); + crypto_blake2b_final(&ctx, out); +} + +void +crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen) +{ + crypto_blake2b_general(out, 64, 0, 0, in, inlen); +} + + +///////////////// +/// Utilities /// +///////////////// + + +static uint32_t +min(uint32_t a, uint32_t b) +{ + return a <= b ? a : b; +} + +// updates a blake2 hash with a 32 bit word, little endian. 
+static void +blake_update_32(crypto_blake2b_ctx *ctx, uint32_t input) +{ + uint8_t buf[4]; + store32_le(buf, input); + crypto_blake2b_update(ctx, buf, 4); +} + +////////////////// +// Argon2 block // +////////////////// +typedef struct block { + uint64_t a[128]; // 1024 octets in 128 64-bit words +} block; + +static void +load_block(block *b, const uint8_t bytes[1024]) +{ + for (int i = 0; i < 128; i++) { + b->a[i] = load64_le(bytes + i * 8); + } +} + +static void +store_block(uint8_t bytes[1024], const block *b) +{ + for (int i = 0; i < 128; i++) { + store64_le(bytes + i * 8, b->a[i]); + } +} + +static void +copy_block(block *out, const block *in) +{ + for (int i = 0; i < 128; i++) { + out->a[i] = in->a[i]; + } +} + +static void +xor_block(block *out, const block *in) +{ + for (int i = 0; i < 128; i++) { + out->a[i] ^= in->a[i]; + } +} + +//////////////////// +// Argon2i proper // +//////////////////// + +// Hash with a virtually unlimited digest size. +// Doesn't extract more entropy than the base hash function. +// Mainly used for filling a whole kilobyte block with pseudo-random bytes. +static void +extended_hash(uint8_t *digest, uint32_t digest_size, + const uint8_t *input , uint32_t input_size) +{ + crypto_blake2b_ctx ctx; + crypto_blake2b_general_init(&ctx, min(digest_size, 64), 0, 0); + blake_update_32 (&ctx, digest_size); + crypto_blake2b_update (&ctx, input, input_size); + crypto_blake2b_final (&ctx, digest); + + if (digest_size > 64) { + // the conversion to u64 avoids integer overflow on + // ludicrously big hash sizes. + uint32_t r = (((uint64_t)digest_size + 31) / 32) - 2; + uint32_t i = 1; + uint32_t in = 0; + uint32_t out = 32; + while (i < r) { + // Input and output overlap. + // This shouldn't be a problem. + crypto_blake2b(digest + out, digest + in, 64); + i += 1; + in += 32; + out += 32; + } + crypto_blake2b_general(digest + out, digest_size - (32 * r), + 0, 0, // no key + digest + in , 64); + } +} + +// Core of the compression function G. 
// Computes Z from R in place.
static void
g_rounds(block *work_block)
{
    // The Blake2b "BlaMka" G function, modified for Argon2:
    // same rotations as Blake2b, plus the 2*a_L*b_L multiplication.
#define LSB(x) ((x) & 0xffffffff)
#define G(a, b, c, d) \
    a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \
    c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \
    a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \
    c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63)
#define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \
              v8, v9, v10, v11, v12, v13, v14, v15) \
    G(v0, v4, v8, v12);  G(v1, v5, v9, v13);  \
    G(v2, v6, v10, v14); G(v3, v7, v11, v15); \
    G(v0, v5, v10, v15); G(v1, v6, v11, v12); \
    G(v2, v7, v8, v13);  G(v3, v4, v9, v14)

    // column rounds (work_block = Q)
    for (int i = 0; i < 128; i += 16) {
        ROUND(work_block->a[i     ], work_block->a[i +  1],
              work_block->a[i +  2], work_block->a[i +  3],
              work_block->a[i +  4], work_block->a[i +  5],
              work_block->a[i +  6], work_block->a[i +  7],
              work_block->a[i +  8], work_block->a[i +  9],
              work_block->a[i + 10], work_block->a[i + 11],
              work_block->a[i + 12], work_block->a[i + 13],
              work_block->a[i + 14], work_block->a[i + 15]);
    }
    // row rounds (work_block = Z)
    for (int i = 0; i < 16; i += 2) {
        ROUND(work_block->a[i      ], work_block->a[i +   1],
              work_block->a[i +  16], work_block->a[i +  17],
              work_block->a[i +  32], work_block->a[i +  33],
              work_block->a[i +  48], work_block->a[i +  49],
              work_block->a[i +  64], work_block->a[i +  65],
              work_block->a[i +  80], work_block->a[i +  81],
              work_block->a[i +  96], work_block->a[i +  97],
              work_block->a[i + 112], work_block->a[i + 113]);
    }
}

// The compression function G
// may overwrite result completely (xcopy == copy_block),
// or XOR result with the old block (xcopy == xor_block)
static void
binary_g(block *result, const block *x, const block *y,
         void (*xcopy) (block*, const block*))
{
    // put R = X ^ Y into tmp
    block tmp;
    copy_block(&tmp, x);
    xor_block (&tmp, y);

    xcopy(result, &tmp);     // save R (erase or xor the old block)
    g_rounds(&tmp);          // tmp = Z
    xor_block(result, &tmp); // result = R ^ Z (or R ^ Z ^ old)
}

// unary version of the compression function.
// The missing argument is implied zero.
// Does the transformation in place.
static void
unary_g(block *work_block)
{
    // work_block == R
    block tmp;
    copy_block(&tmp, work_block); // tmp        = R
    g_rounds(work_block);         // work_block = Z
    xor_block(work_block, &tmp);  // work_block = Z ^ R
}

// State for generating the Argon2i data-independent reference indices.
// ctx->b holds 128 pseudo-random 64-bit words, refreshed lazily
// (one refresh per 128 calls to gidx_next).
typedef struct gidx_ctx {
    block    b;             // current batch of pseudo-random numbers
    uint32_t pass_number;
    uint32_t slice_number;
    uint32_t nb_blocks;
    uint32_t nb_iterations;
    uint32_t ctr;           // counter word fed into the input block
    uint32_t index;         // position (0..127) within ctx->b
} gidx_ctx;

// Regenerates ctx->b from the current counters.
static void
gidx_refresh(gidx_ctx *ctx)
{
    ctx->b.a[0] = ctx->pass_number;
    ctx->b.a[1] = 0;  // lane number (we have only one)
    ctx->b.a[2] = ctx->slice_number;
    ctx->b.a[3] = ctx->nb_blocks;
    ctx->b.a[4] = ctx->nb_iterations;
    ctx->b.a[5] = 1;  // type: Argon2i
    ctx->b.a[6] = ctx->ctr;
    // zero the rest of the block
    for (int i = 7; i < 128; i++) {
        ctx->b.a[i] = 0;
    }

    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
    // Applies the G "square" function to get cheap pseudo-random numbers.
    unary_g(&(ctx->b));
    unary_g(&(ctx->b)); // square means apply it twice
}

static void
gidx_init(gidx_ctx *ctx,
          uint32_t pass_number,
          uint32_t slice_number,
          uint32_t nb_blocks,
          uint32_t nb_iterations)
{
    ctx->pass_number   = pass_number;
    ctx->slice_number  = slice_number;
    ctx->nb_blocks     = nb_blocks;
    ctx->nb_iterations = nb_iterations;
    ctx->ctr           = 1; // not zero, surprisingly
    // Blocks 0 and 1 of the first slice of the first pass are filled
    // directly from the initial hash, so indexing starts at 2 there.
    ctx->index         = pass_number == 0 && slice_number == 0 ? 2 : 0;
    gidx_refresh(ctx);
}

// Returns the index of the next reference block.
// Called exactly once per block filled, in order.
static uint32_t
gidx_next(gidx_ctx *ctx)
{
    // lazily creates the index block we need
    if (ctx->index == 128) {
        ctx->index = 0;
        ctx->ctr++;
        gidx_refresh(ctx);
    }
    // saves and increment the index
    uint32_t index = ctx->index;
    ctx->index++; // updates index for the next call

    // Computes the area size.
    // Pass 0 : all already finished segments plus already constructed
    //          blocks in this segment
    // Pass 1+: 3 last segments plus already constructed
    //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
    // NOTE(review): `index` here is the position within the current
    // 128-word batch, not the position within the segment — confirm this
    // matches the reference implementation's `position.index`.
    _Bool    first_pass = ctx->pass_number == 0;
    uint32_t slice_size = ctx->nb_blocks / 4;
    uint32_t area_size  = ((first_pass ? ctx->slice_number : 3)
                           * slice_size + index - 1);

    // Computes the starting position of the reference area.
    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
    // NEXT SEGMENT, NOT THE NEXT BLOCK.
    uint32_t next_slice = (ctx->slice_number == 3
                           ? 0
                           : (ctx->slice_number + 1) * slice_size);
    uint32_t start_pos = first_pass ? 0 : next_slice;

    // Generates the actual index from J1 (no need for J2, there's only one lane)
    uint64_t j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number
    uint64_t x  = (j1 * j1)       >> 32;
    uint64_t y  = (area_size * x) >> 32;
    uint64_t z  = area_size - 1 - y;
    return (start_pos + z) % ctx->nb_blocks;
}

// Main algorithm
void
crypto_argon2i_hash(uint8_t       *tag,      uint32_t tag_size,
                    const uint8_t *password, uint32_t password_size,
                    const uint8_t *salt,     uint32_t salt_size,
                    const uint8_t *key,      uint32_t key_size,
                    const uint8_t *ad,       uint32_t ad_size,
                    void *work_area,
                    uint32_t nb_blocks,
                    uint32_t nb_iterations)
{
    // work area seen as blocks (must be suitably aligned)
    block *blocks = work_area;
    {
        // H0: hash of all parameters and inputs, in fixed order.
        crypto_blake2b_ctx ctx;
        crypto_blake2b_init(&ctx);

        blake_update_32      (&ctx, 1            ); // p: number of threads
        blake_update_32      (&ctx, tag_size     );
        blake_update_32      (&ctx, nb_blocks    );
        blake_update_32      (&ctx, nb_iterations);
        blake_update_32      (&ctx, 0x13         ); // v: version number
        blake_update_32      (&ctx, 1            ); // y: Argon2i
        blake_update_32      (&ctx, password_size);
        crypto_blake2b_update(&ctx, password, password_size);
        blake_update_32      (&ctx, salt_size);
        crypto_blake2b_update(&ctx, salt, salt_size);
        blake_update_32      (&ctx, key_size);
        crypto_blake2b_update(&ctx, key, key_size);
        blake_update_32      (&ctx, ad_size);
        crypto_blake2b_update(&ctx, ad, ad_size);

        uint8_t initial_hash[72]; // 64 bytes plus 2 words for future hashes
        crypto_blake2b_final(&ctx, initial_hash);

        // fill first 2 blocks with H'(H0 || i || lane), i = 0, 1
        block tmp_block;
        uint8_t hash_area[1024];
        store32_le(initial_hash + 64, 0); // first  additional word
        store32_le(initial_hash + 68, 0); // second additional word
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks, &tmp_block);

        store32_le(initial_hash + 64, 1); // slight modification
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks + 1, &tmp_block);
    }

    // Actual number of blocks
    nb_blocks -= nb_blocks % 4; // round down to 4 p (p == 1 thread)
    const uint32_t segment_size = nb_blocks / 4;

    // fill (then re-fill) the rest of the blocks
    for (uint32_t pass_number = 0; pass_number < nb_iterations; pass_number++) {
        _Bool first_pass = pass_number == 0;
        // Simple copy on pass 0, XOR instead of overwrite on subsequent passes
        void (*xcopy) (block*, const block*) = first_pass ? copy_block : xor_block;

        for (int segment = 0; segment < 4; segment++ ) {

            gidx_ctx ctx;
            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);

            // On the first segment of the first pass,
            // blocks 0 and 1 are already filled.
            // We use the offset to skip them.
            uint32_t offset = first_pass && segment == 0 ? 2 : 0;
            // current, reference, and previous are block indices
            for (uint32_t current = segment * segment_size + offset;
                 current < (segment + 1) * segment_size;
                 current++) {
                // previous block wraps around to the last block
                uint32_t previous  = current == 0 ? nb_blocks - 1 : current - 1;
                uint32_t reference = gidx_next(&ctx);
                binary_g(blocks + current,
                         blocks + previous,
                         blocks + reference,
                         xcopy);
            }
        }
    }
    // hash the very last block with H' into the output tag
    uint8_t final_block[1024];
    store_block(final_block, blocks + (nb_blocks - 1));
    extended_hash(tag, tag_size, final_block, 1024);
}

// Convenience wrapper: no key, no additional data, 32-byte tag.
void
crypto_argon2i(uint8_t        tag[32],
               const uint8_t *password, uint32_t password_size,
               const uint8_t *salt,     uint32_t salt_size,
               void    *work_area,
               uint32_t nb_blocks,
               uint32_t nb_iterations)
{
    crypto_argon2i_hash(tag     , 32,
                        password, password_size,
                        salt    , salt_size,
                        0, 0, 0, 0,
                        work_area, nb_blocks, nb_iterations);
}

// Taken from TweetNaCl


#define FOR(i, start, end) for (size_t i = start; i < end; i++)
#define sv static void
// A field element mod 2^255 - 19, as 16 little-endian 16-bit limbs.
typedef int64_t gf[16];

static const uint8_t _0[16];          // all-zero input block
static const uint8_t _9[32] = { 9 };  // the X25519 base point
static const gf _121665 = { 0xdb41, 1 }; // (A - 2) / 4 for curve25519

// Carry propagation: brings every limb of o back under 16 bits,
// folding the top carry back in (times 38 == 2 * 19).
sv car_25519(gf o)
{
    FOR(i, 0, 16) {
        o[i] += 1LL << 16;
        int64_t c = o[i] >> 16;
        o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15));
        o[i] -= c << 16;
    }
}

// Constant-time conditional swap: exchanges p and q iff b == 1.
sv sel_25519(gf p, gf q, int b)
{
    int64_t c = ~(b-1); // b == 1 ? all ones : all zeroes
    FOR(i, 0, 16) {
        int64_t t = c & (p[i] ^ q[i]);
        p[i] ^= t;
        q[i] ^= t;
    }
}

// Freezes n to its canonical representative mod 2^255 - 19
// and serializes it to 32 little-endian bytes.
sv pack_25519(uint8_t *o, const gf n)
{
    gf t;
    FOR(i, 0, 16) t[i] = n[i];
    car_25519(t);
    car_25519(t);
    car_25519(t);
    FOR(j, 0, 2) {
        // conditionally subtract p = 2^255 - 19, in constant time
        gf m;
        m[0] = t[0] - 0xffed;
        FOR(i, 1, 15) {
            m[i  ]  = t[i] - 0xffff - ((m[i-1] >> 16) & 1);
            m[i-1] &= 0xffff;
        }
        m[15] = t[15] - 0x7fff - ((m[14] >> 16) & 1);
        int b = (m[15] >> 16) & 1;
        m[14] &= 0xffff;
        sel_25519(t, m, 1-b);
    }
    FOR(i, 0, 16) {
        o[2*i    ] = t[i] & 0xff;
        o[2*i + 1] = t[i] >> 8;
    }
}

// Deserializes 32 little-endian bytes into a field element,
// clearing the top bit.
sv unpack_25519(gf o, const uint8_t *n)
{
    FOR(i, 0, 16) o[i] = n[2*i] + ((int64_t)n[2*i + 1] << 8);
    o[15] &= 0x7fff;
}

// o = a + b (no carry propagation; limbs may exceed 16 bits)
sv A(gf o, const gf a, const gf b)
{
    FOR(i, 0, 16) o[i] = a[i] + b[i];
}

// o = a - b (no carry propagation)
sv Z(gf o, const gf a, const gf b)
{
    FOR(i, 0, 16) o[i] = a[i] - b[i];
}

// o = a * b, with reduction mod 2^255 - 19 and carry propagation
sv M(gf o, const gf a, const gf b)
{
    int64_t t[31];
    FOR(i, 0, 31) t[i] = 0;
    FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j];
    FOR(i, 0, 15) t[i] += 38 * t[i+16]; // 2^256 == 38 (mod p)
    FOR(i, 0, 16) o[i] = t[i];
    car_25519(o);
    car_25519(o);
}

// o = a^2
sv S(gf o,const gf a)
{
    M(o, a, a);
}

// o = i^-1, via Fermat: i^(p-2) mod p, constant-time square-and-multiply
sv inv_25519(gf o,const gf i)
{
    gf c;
    FOR(a, 0, 16) c[a] = i[a];
    for(int a = 253; a >= 0; a--) {
        S(c, c);
        if(a != 2 && a != 4) // skips the two zero bits of p - 2
            M(c, c, i);
    }
    FOR(a, 0, 16) o[a] = c[a];
}

// X25519 scalar multiplication: q = n * p, Montgomery ladder.
void crypto_x25519(uint8_t q[32], const uint8_t n[32], const uint8_t p[32])
{
    uint8_t z[32];
    int64_t x[80];
    int64_t r;
    gf a, b, c, d, e, f;
    // clamp the scalar per the X25519 specification
    FOR(i, 0, 31) z[i] = n[i];
    z[31] = (n[31] & 127) | 64;
    z[0 ] &= 248;
    unpack_25519(x, p);
    FOR(i, 0, 16) {
        b[i] = x[i];
        d[i] = a[i] = c[i] = 0;
    }
    a[0] = d[0] = 1;
    // constant-time Montgomery ladder over the 255 scalar bits
    for(int i = 254; i>=0; i--) {
        r = (z[i>>3] >> (i & 7)) & 1;
        sel_25519(a, b, r);
        sel_25519(c, d, r);
        A(e, a, c);
        Z(a, a, c);
        A(c, b, d);
        Z(b, b, d);
        S(d, e);
        S(f, a);
        M(a, c, a);
        M(c, b, e);
        A(e, a, c);
        Z(a, a, c);
        S(b, a);
        Z(c, d, f);
        M(a, c, _121665);
        A(a, a, d);
        M(c, c, a);
        M(a, d, f);
        M(d, b, x);
        S(b, e);
        sel_25519(a, b, r);
        sel_25519(c, d, r);
    }
    FOR(i, 0, 16) {
        x[i+16] = a[i];
        x[i+32] = c[i];
        x[i+48] = b[i];
        x[i+64] = d[i];
    }
    // projective -> affine: multiply by z^-1, then serialize
    inv_25519(x+32, x+32);
    M(x+16, x+16, x+32);
    pack_25519(q, x+16);
}

// Public key generation: q = n * base point (9).
void crypto_x25519_base(uint8_t q[32], const uint8_t n[32])
{
    crypto_x25519(q, n, _9);
}
// Taken from TweetNaCl.
+// I tried the ref10 implementation, but that was too damn big + + +#define FOR(i, start, end) for (size_t i = start; i < end; i++) +#define sv static void +#define sc static const + +typedef uint8_t u8; +typedef int64_t i64; +typedef uint64_t u64; +typedef i64 gf[16]; + +sc gf gf0; +sc gf gf1 = { 1 }; +sc gf D = { 0x78a3, 0x1359, 0x4dca, 0x75eb, 0xd8ab, 0x4141, 0x0a4d, 0x0070, + 0xe898, 0x7779, 0x4079, 0x8cc7, 0xfe73, 0x2b6f, 0x6cee, 0x5203}; +sc gf D2 = { 0xf159, 0x26b2, 0x9b94, 0xebd6, 0xb156, 0x8283, 0x149a, 0x00e0, + 0xd130, 0xeef3, 0x80f2, 0x198e, 0xfce7, 0x56df, 0xd9dc, 0x2406}; +sc gf X = { 0xd51a, 0x8f25, 0x2d60, 0xc956, 0xa7b2, 0x9525, 0xc760, 0x692c, + 0xdc5c, 0xfdd6, 0xe231, 0xc0a4, 0x53fe, 0xcd6e, 0x36d3, 0x2169}; +sc gf Y = { 0x6658, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, + 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666, 0x6666}; +sc gf I = { 0xa0b0, 0x4a0e, 0x1b27, 0xc4ee, 0xe478, 0xad2f, 0x1806, 0x2f43, + 0xd7a7, 0x3dfb, 0x0099, 0x2b4d, 0xdf0b, 0x4fc1, 0x2480, 0x2b83}; + +sc u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, + 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }; + +sv set_25519(gf r, const gf a) { FOR(i, 0, 16) r[i] = a[i]; } + +static u8 par_25519(const gf a) +{ + u8 d[32]; + pack_25519(d, a); + return d[0] & 1; +} + +sv pow2523(gf o,const gf i) +{ + gf c; + FOR(a, 0, 16) c[a] = i[a]; + for(int a = 250; a >= 0; a--) { + S(c, c); + if(a != 1) M(c, c, i); + } + FOR(a, 0, 16) o[a] = c[a]; +} + +static int vn(const u8 *x, const u8 *y, size_t n) +{ + uint32_t d = 0; + FOR(i, 0, n) d |= x[i] ^ y[i]; + return (1 & ((d - 1) >> 8)) - 1; +} + +static int neq_25519(const gf a, const gf b) +{ + u8 c[32],d[32]; + pack_25519(c, a); + pack_25519(d, b); + return vn(c, d, 32); +} + +sv add(gf p[4], gf q[4]) +{ + gf a, b, c, d, t, e, f, g, h; + Z(a, p[1], p[0]); + Z(t, q[1], q[0]); + M(a, a, t); + A(b, p[0], p[1]); + 
A(t, q[0], q[1]); + M(b, b, t); + M(c, p[3], q[3]); + M(c, c, D2); + M(d, p[2], q[2]); + A(d, d, d); + Z(e, b, a); + Z(f, d, c); + A(g, d, c); + A(h, b, a); + + M(p[0], e, f); + M(p[1], h, g); + M(p[2], g, f); + M(p[3], e, h); +} + +sv cswap(gf p[4], gf q[4], u8 b) +{ + FOR(i, 0, 4) + sel_25519(p[i],q[i],b); +} + +sv pack(u8 *r, gf p[4]) +{ + gf tx, ty, zi; + inv_25519(zi, p[2]); + M(tx, p[0], zi); + M(ty, p[1], zi); + pack_25519(r, ty); + r[31] ^= par_25519(tx) << 7; +} + +sv scalarmult(gf p[4], gf q[4], const u8 *s) +{ + set_25519(p[0], gf0); + set_25519(p[1], gf1); + set_25519(p[2], gf1); + set_25519(p[3], gf0); + for (int i = 255; i >= 0; i--) { + u8 b = (s[i/8] >> (i & 7)) & 1; + cswap(p, q, b); + add(q, p); + add(p, p); + cswap(p, q, b); + } +} + +sv scalarbase(gf p[4], const u8 *s) +{ + gf q[4]; + set_25519(q[0], X); + set_25519(q[1], Y); + set_25519(q[2], gf1); + M(q[3], X, Y); + scalarmult(p, q, s); +} + +sv modL(u8 *r, i64 x[64]) +{ + i64 i, j; + for (i = 63;i >= 32;--i) { + i64 carry = 0; + for (j = i - 32;j < i - 12;++j) { + x[j] += carry - 16 * x[i] * L[j - (i - 32)]; + carry = (x[j] + 128) >> 8; + x[j] -= carry << 8; + } + x[j] += carry; + x[i] = 0; + } + i64 carry = 0; + FOR(j, 0, 32) { + x[j] += carry - (x[31] >> 4) * L[j]; + carry = x[j] >> 8; + x[j] &= 255; + } + FOR(j, 0, 32) x[j] -= carry * L[j]; + FOR(i, 0, 32) { + x[i+1] += x[i] >> 8; + r[i ] = x[i] & 255; + } +} + +sv reduce(u8 r[64]) +{ + i64 x[64]; + FOR(i, 0, 64) x[i] = (u64) r[i]; + FOR(i, 0, 64) r[i] = 0; + modL(r, x); +} + +static int unpackneg(gf r[4],const u8 p[32]) +{ + gf t, chk, num, den, den2, den4, den6; + set_25519(r[2], gf1); + unpack_25519(r[1], p); + S(num,r [1]); + M(den, num, D); + Z(num, num, r[2]); + A(den, r[2], den); + + S(den2, den); + S(den4, den2); + M(den6, den4, den2); + M(t, den6, num); + M(t, t, den); + + pow2523(t, t); + M(t, t, num); + M(t, t, den); + M(t, t, den); + M(r[0], t, den); + + S(chk, r[0]); + M(chk, chk, den); + if (neq_25519(chk, num)) M(r[0], r[0], 
I); + + S(chk, r[0]); + M(chk, chk, den); + if (neq_25519(chk, num)) return -1; + + if (par_25519(r[0]) == (p[31]>>7)) Z(r[0],gf0,r[0]); + + M(r[3], r[0], r[1]); + return 0; +} + +#ifdef ED25519_SHA512 + #include "sha512.h" + #define HASH crypto_sha512 +#else + #define HASH crypto_blake2b +#endif + +#define COMBINE1(x, y) x ## y +#define COMBINE2(x, y) COMBINE1(x, y) +#define HASH_CTX COMBINE2(HASH, _ctx) +#define HASH_INIT COMBINE2(HASH, _init) +#define HASH_UPDATE COMBINE2(HASH, _update) +#define HASH_FINAL COMBINE2(HASH, _final) + +// hash function interface +// Typical uses: sha512 for tests vectors, blake2b for production. +void HASH_INIT (HASH_CTX *ctx); +void HASH_UPDATE(HASH_CTX *ctx, const u8 *in, size_t inlen); +void HASH_FINAL (HASH_CTX *ctx, u8 hash[64]); +void HASH(u8 hash[64], const u8 *in, size_t inlen); + +sv hash_k(u8 k[64], const u8 R[32], const u8 A[32], const u8 *M, size_t M_size) +{ + HASH_CTX ctx; + HASH_INIT (&ctx); + HASH_UPDATE(&ctx, R , 32 ); + HASH_UPDATE(&ctx, A , 32 ); + HASH_UPDATE(&ctx, M , M_size); + HASH_FINAL (&ctx, k); + reduce(k); +} + +void crypto_ed25519_public_key(uint8_t public_key[32], + const uint8_t secret_key[32]) +{ + // hash the private key, turn the hash into a scalar + u8 a[64]; + HASH(a, secret_key, 32); + a[ 0] &= 248; + a[31] &= 127; + a[31] |= 64; + + // the public key is the packed form of the point aB (B == basepoint) + gf aB[4]; + scalarbase(aB, a); + pack(public_key, aB); +} + +void crypto_ed25519_sign(uint8_t signature[64], + const uint8_t secret_key[32], + const uint8_t *message, + size_t message_size) +{ + u8 h[64]; + u8 *a = h; // secret scalar + u8 *prefix = h + 32; // prefix for nonce generation + HASH(h, secret_key, 32); + + // build public key from secret key + a[ 0] &= 248; + a[31] &= 127; + a[31] |= 64; + gf aB[4]; + scalarbase(aB, a); + u8 public_key[32]; + pack(public_key, aB); + + // Constructs the "random" nonce from the secret key and message. 
+ // An actual random number would work just fine, and would save us + // the trouble of hashing the message twice. If we did that + // however, the user could fuck it up and reuse the nonce. + u8 r[64]; + HASH_CTX ctx; + HASH_INIT (&ctx); + HASH_UPDATE(&ctx, prefix , 32 ); + HASH_UPDATE(&ctx, message, message_size); + HASH_FINAL (&ctx, r); + + gf rB[4]; + reduce(r); + scalarbase(rB, r); + pack(signature, rB); // first half of the signature = "random" nonce + + u8 k[64]; + hash_k(k, signature, public_key, message, message_size); + + i64 s[64]; // s = r + k a + FOR(i, 0, 32) s[i] = (u64) r[i]; + FOR(i, 32, 64) s[i] = 0; + FOR(i, 0, 32) { + FOR(j, 0, 32) { + s[i+j] += k[i] * (u64) a[j]; + } + } + modL(signature + 32, s); // second half of the signature = s +} + +int crypto_ed25519_check(const uint8_t signature[64], + const uint8_t public_key[32], + const uint8_t *message, + size_t message_size) +{ + gf aB[4]; if (unpackneg(aB, public_key)) return -1; // -aB + u8 k[64]; hash_k(k, signature, public_key, message, message_size); + gf p[4]; scalarmult(p, aB, k); // p = -aB k + gf sB[4]; scalarbase(sB, signature + 32); add(p, sB); // p = s - aB k + u8 t[32]; pack(t, p); + return vn(signature, t, 32); // R == s - aB k ? 
OK : fail +} + +void crypto_ae_lock_detached(uint8_t mac[16], + uint8_t *ciphertext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size) +{ + crypto_chacha_ctx e_ctx; + uint8_t auth_key[32]; + crypto_chacha20_Xinit (&e_ctx, key, nonce); + crypto_chacha20_random(&e_ctx, auth_key, 32); + + crypto_chacha20_encrypt(&e_ctx, plaintext, ciphertext, text_size); + crypto_poly1305_auth(mac, ciphertext, text_size, auth_key); +} + +int crypto_ae_unlock_detached(uint8_t *plaintext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t mac[16], + const uint8_t *ciphertext, + size_t text_size) +{ + crypto_chacha_ctx e_ctx; + uint8_t auth_key[32]; + crypto_chacha20_Xinit (&e_ctx, key, nonce); + crypto_chacha20_random(&e_ctx, auth_key, 32); + + uint8_t real_mac[16]; + crypto_poly1305_auth(real_mac, ciphertext, text_size, auth_key); + + if (crypto_memcmp_16(real_mac, mac)) + return -1; + + crypto_chacha20_encrypt(&e_ctx, ciphertext, plaintext, text_size); + return 0; +} + +void crypto_ae_lock(uint8_t *box, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size) +{ + crypto_ae_lock_detached(box, box + 16, key, nonce, plaintext, text_size); +} + +int crypto_ae_unlock(uint8_t *plaintext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *box, + size_t text_size) +{ + return crypto_ae_unlock_detached(plaintext, key, nonce, + box, box + 16, text_size); +} + +void crypto_lock_key(uint8_t shared_key[32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32]) +{ + static const uint8_t _0[16]; + uint8_t shared_secret[32]; + crypto_x25519(shared_secret, your_secret_key, their_public_key); + crypto_chacha20_H(shared_key, shared_secret, _0); +} + +void crypto_lock_detached(uint8_t mac[16], + uint8_t *ciphertext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t 
text_size) +{ + uint8_t shared_key[32]; + crypto_lock_key(shared_key, your_secret_key, their_public_key); + crypto_ae_lock_detached(mac, ciphertext, + shared_key, nonce, + plaintext, text_size); +} + +int crypto_unlock_detached(uint8_t *plaintext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t mac[16], + const uint8_t *ciphertext, + size_t text_size) +{ + uint8_t shared_key[32]; + crypto_lock_key(shared_key, your_secret_key, their_public_key); + return crypto_ae_unlock_detached(plaintext, + shared_key, nonce, + mac, ciphertext, text_size); +} + +void crypto_lock(uint8_t *box, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size) +{ + crypto_lock_detached(box, box + 16, + your_secret_key, their_public_key, nonce, + plaintext, text_size); +} + +int crypto_unlock(uint8_t *plaintext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *box, + size_t text_size) +{ + return crypto_unlock_detached(plaintext, + your_secret_key, their_public_key, nonce, + box, box + 16, text_size); +} + +static const uint8_t null_nonce[24] = {}; + +void crypto_anonymous_lock(uint8_t *box, + const uint8_t random_secret_key[32], + const uint8_t their_public_key[32], + const uint8_t *plaintext, + size_t text_size) +{ + crypto_x25519_base(box, random_secret_key); // put public key in box + crypto_lock(box + 32, + random_secret_key, their_public_key, null_nonce, + plaintext, text_size); +} + +int crypto_anonymous_unlock(uint8_t *plaintext, + const uint8_t your_secret_key[32], + const uint8_t *box, + size_t text_size) +{ + return crypto_unlock(plaintext, + your_secret_key, box, null_nonce, + box + 32, text_size); +} diff --git a/monocypher.h b/monocypher.h new file mode 100644 index 0000000..7829768 --- /dev/null +++ b/monocypher.h @@ -0,0 +1,335 @@ +#ifndef 
MONOCYPHER_H
#define MONOCYPHER_H

#include <stdint.h>
#include <stddef.h>

// This is a chacha20 context.
// To use safely, just follow these guidelines:
// - Always initialize your context with one of the crypto_chacha20_*init
//   functions below
// - Don't modify it, except through the crypto_chacha20_* below.
// - Never duplicate it.
typedef struct crypto_chacha_ctx {
    uint32_t input[16];       // current input, unencrypted
    uint8_t  random_pool[64]; // last input, encrypted
    uint8_t  pool_index;      // pointer to random_pool
} crypto_chacha_ctx;

// HChacha20.  *Kind* of a cryptographic hash, based on the chacha20 rounds.
// Used for XChacha20, and the key derivation of the X25519 shared secret.
// Don't use it unless you really know what you're doing.
void
crypto_chacha20_H(uint8_t       out[32],
                  const uint8_t key[32],
                  const uint8_t in [16]);

// Initializes a chacha context.
//
// WARNING: DON'T USE THE SAME NONCE AND KEY TWICE
//
// You'd be exposing the XOR of subsequent encrypted
// messages, thus destroying your confidentiality.
//
// WARNING: DON'T SELECT THE NONCE AT RANDOM
//
// If you encode enough messages with a random nonce, there's a good
// chance some of them will use the same nonce by accident.  64 bits
// just isn't enough for this.  Use a counter instead.
//
// If there are multiple parties sending out messages, you can give them
// all an initial nonce of 0, 1 .. n-1 respectively, and have them increment
// their nonce by n.  (Also make sure the nonces never wrap around.)
void
crypto_chacha20_init(crypto_chacha_ctx *ctx,
                     const uint8_t      key[32],
                     const uint8_t      nonce[8]);

// Initializes a chacha context, with a big nonce (192 bits),
// more than enough to be selected at random.
//
// The price you pay for that is a slower initialization.  The security
// guarantees are the same as regular initialization.
+void +crypto_chacha20_Xinit(crypto_chacha_ctx *ctx, + const uint8_t key[32], + const uint8_t nonce[24]); + +// Encrypts the plain_text by XORing it with a pseudo-random +// stream of numbers, seeded by the provided chacha20 context. +// Decryption uses the exact same method. +// +// Once the context is initialized, encryptions can safely be chained thus: +// +// crypto_encrypt_chacha20(ctx, plain_0, cipher_0, length_0); +// crypto_encrypt_chacha20(ctx, plain_1, cipher_1, length_1); +// crypto_encrypt_chacha20(ctx, plain_2, cipher_2, length_2); +// +// plain_text and cipher_text may point to the same location, for in-place +// encryption. +// +// plain_text is allowed to be null (0), in which case it will be +// interpreted as an all zero input. The cipher_text will then +// contain the raw chacha20 stream. Useful as a random number +// generator. +// +// WARNING: ENCRYPTION ALONE IS NOT SECURE. YOU NEED AUTHENTICATION AS WELL. +// Use the provided authenticated encryption constructions. +void +crypto_chacha20_encrypt(crypto_chacha_ctx *ctx, + const uint8_t *plain_text, + uint8_t *cipher_text, + size_t message_size); + +// convenience function. Same as chacha20_encrypt() with a null plain_text. +void +crypto_chacha20_random(crypto_chacha_ctx *ctx, + uint8_t *cipher_text, + size_t message_size); + + +typedef struct { + uint32_t r[4]; + uint32_t h[5]; + uint32_t c[5]; + uint32_t pad[5]; + size_t c_index; +} crypto_poly1305_ctx; + + +// Initializes the poly1305 context with the secret key. +// Call first (obviously). +// WARNING: NEVER AUTHENTICATE 2 MESSAGES WITH THE SAME KEY. +// This is a ONE TIME authenticator. If you authenticate 2 messages +// with the same key, the attacker may deduce your secret key and +// authenticate messages in your stead. +void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]); + +// Updates the poly1305 context with a chunk of the message +// Can be called multiple times, once for each chunk. 
+// Make sure the chunks are processed in order, without overlap or hole... +void crypto_poly1305_update(crypto_poly1305_ctx *ctx, + const uint8_t *m, size_t bytes); + +// Authenticate the message munched through previous update() calls. +// Call last (obviously). +void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]); + + +// Convenience all in one function +void crypto_poly1305_auth(uint8_t mac[16], + const uint8_t *m, + size_t msg_length, + const uint8_t key[32]); + +// Constant time equality verification +// returns 0 if it matches, something else otherwise. +int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]); + + +// blake2b context +typedef struct { + uint8_t buf[128]; // input buffer + uint64_t hash[8]; // chained state + uint64_t input_size[2]; // total number of bytes + uint8_t c; // pointer for buf[] + uint8_t output_size; // digest size +} crypto_blake2b_ctx; + +// Initializes the context with user defined parameters: +// outlen: the length of the hash. Must be between 1 and 64. +// keylen: length of the key. Must be between 0 and 64. +// key : some secret key. May be NULL if keylen is 0. +// Any deviation from these invariants results in UNDEFINED BEHAVIOR +void +crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t outlen, + const uint8_t *key, size_t keylen); + +// Convenience function: 64 bytes hash, no secret key. +void +crypto_blake2b_init(crypto_blake2b_ctx *ctx); + +// Add "inlen" bytes from "in" into the hash. +void +crypto_blake2b_update(crypto_blake2b_ctx *ctx, const uint8_t *in, size_t inlen); + +// Generate the message digest (size given in init). +void +crypto_blake2b_final(crypto_blake2b_ctx *ctx, uint8_t *out); + +// All-in-one convenience function. 
// outlen, keylen, and key work the same as they do in the general_init function
void
crypto_blake2b_general( uint8_t       *out, size_t outlen, // digest
                        const uint8_t *key, size_t keylen, // optional secret key
                        const uint8_t *in , size_t inlen); // data to be hashed

// All-in-one convenience function: 64-byte hash, no secret key.
void
crypto_blake2b(uint8_t out[64], const uint8_t *in, size_t inlen);



// Implements argon2i, with degree of parallelism 1,
// because it's good enough, and threads are scary.
//
// key and ad are optional.  They can be NULL if their respective size is 0.
// work_area is a pointer to a contiguous chunk of memory of at least
// nb_blocks * 1024 bytes.  It must be suitably aligned for 64-bit words.
// Don't worry too much about alignment, malloc()'s results work.
//
// Choice of parameters for password hashing:
// - If you need a key, use a 32-byte one.
// - Do what you will with the ad.
// - Use a 32-byte tag (to get a 256-bit key)
// - Put 128 bits of entropy in the salt.  16 random bytes work well.
// - Use all the memory you can get away with.
// - Use as many iterations as reasonable.  No less than 10 passes if you can.
void
crypto_argon2i_hash(uint8_t       *tag,      uint32_t tag_size,      // >= 4
                    const uint8_t *password, uint32_t password_size,
                    const uint8_t *salt,     uint32_t salt_size,     // >= 8
                    const uint8_t *key,      uint32_t key_size,
                    const uint8_t *ad,       uint32_t ad_size,
                    void *work_area,
                    uint32_t nb_blocks,                              // >= 8
                    uint32_t nb_iterations);

// Convenience function.  No key, no ad, 32-byte tag
void
crypto_argon2i(uint8_t        tag[32],
               const uint8_t *password, uint32_t password_size,
               const uint8_t *salt,     uint32_t salt_size,          // >= 8
               void *work_area,
               uint32_t nb_blocks,                                   // >= 8
               uint32_t nb_iterations);


// Computes a shared secret from your private key and their public key.
// WARNING: DO NOT USE THE SHARED SECRET DIRECTLY.
// The shared secret is not pseudo-random.
You need to hash it to derive +// an acceptable secret key. Any cryptographic hash can work, as well as +// HChacha20. +// +// Implementation details: this is an elliptic curve. The public key is +// a point on this curve, and your private key is a scalar. The shared +// secret is another point on this curve, obtained by scalar multiplication. +// Basically: +// shared_secret == your_sk * their_pk == your_sk * (their_sk * base_point) +// == their_sk * your_pk == their_sk * (your_sk * base_point) +void crypto_x25519(uint8_t shared_secret [32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32]); + +// Generates a public key from the specified secret key. +// Make sure the secret key is randomly selected. +// +// Implementation detail: your secret key is a scalar, and we multiply +// the base point (a constant) by it to obtain a public key. That is: +// public_key == secret_key * base_point +// Reversing the operation is conjectured to be infeasible +// without quantum computers (128 bits of security). +void crypto_x25519_base(uint8_t public_key[32], const uint8_t secret_key[32]); + + +void crypto_ed25519_public_key(uint8_t public_key[32], + const uint8_t secret_key[32]); + +void crypto_ed25519_sign(uint8_t signature[64], + const uint8_t secret_key[32], + const uint8_t *message, + size_t message_size); + +int crypto_ed25519_check(const uint8_t signature[64], + const uint8_t public_key[32], + const uint8_t *message, + size_t message_size); + + +// Authenticated encryption with XChacha20 and Poly1305. +void crypto_ae_lock_detached(uint8_t mac[16], + uint8_t *ciphertext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size); + +// Authenticated encryption with XChacha20 and Poly1305. +// Returns -1 and has no effect if the message is forged. 
+int crypto_ae_unlock_detached(uint8_t *plaintext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t mac[16], + const uint8_t *ciphertext, + size_t text_size); + +// Like the above, only puts the mac and the ciphertext together +// in a "box", mac first +void crypto_ae_lock(uint8_t *box, // text_size + 16 + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size); + +// Unlocks a box locked by aead_lock() +int crypto_ae_unlock(uint8_t *plaintext, + const uint8_t key[32], + const uint8_t nonce[24], + const uint8_t *box, // text_size + 16 + size_t text_size); + + +// Computes a shared key with your secret key and their public key, +// suitable for crypto_ae* functions. +void crypto_lock_key(uint8_t shared_key [32], + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32]); + +// Authenticated encryption with the sender's secret key and the recipient's +// public key. The message leaks if one of the secret key gets compromised. +void crypto_lock_detached(uint8_t mac[16], + uint8_t *ciphertext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size); + +// Authenticated decryption with the recipient's secret key, and the sender's +// public key. Has no effect if the message is forged. 
+int crypto_unlock_detached(uint8_t *plaintext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t mac[16], + const uint8_t *ciphertext, + size_t text_size); + +// Like the above, only puts the mac and the ciphertext together +// in a "box", mac first +void crypto_lock(uint8_t *box, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *plaintext, + size_t text_size); + +// Unlocks a box locked by crypto_lock() +int crypto_unlock(uint8_t *plaintext, + const uint8_t your_secret_key [32], + const uint8_t their_public_key[32], + const uint8_t nonce[24], + const uint8_t *box, + size_t text_size); + +void crypto_anonymous_lock(uint8_t *box, + const uint8_t random_secret_key[32], + const uint8_t their_public_key[32], + const uint8_t *plaintext, + size_t text_size); + +int crypto_anonymous_unlock(uint8_t *plaintext, + const uint8_t your_secret_key[32], + const uint8_t *box, + size_t text_size); + +#endif // MONOCYPHER_H diff --git a/poly1305.c b/poly1305.c deleted file mode 100644 index c124977..0000000 --- a/poly1305.c +++ /dev/null @@ -1,154 +0,0 @@ -#include "poly1305.h" - -static uint32_t load32_le(const uint8_t s[4]) -{ - return s[0] - | (s[1] << 8) - | (s[2] << 16) - | (s[3] << 24); -} - -static void store32_le(uint8_t output[4], uint32_t input) -{ - output[0] = input & 0xff; - output[1] = (input >> 8) & 0xff; - output[2] = (input >> 16) & 0xff; - output[3] = (input >> 24) & 0xff; -} - -static void poly_load(uint32_t out[4], const uint8_t in[16]) -{ - for (int i = 0; i < 4; i++) - out[i] = load32_le(in + i*4); -} - -static void poly_add(uint32_t out[5], const uint32_t a[5], const uint32_t b[5]) -{ - uint64_t carry = 0; - for (int i = 0; i < 5; i++) { - carry += (int64_t)(a[i]) + b[i]; - out[i] = carry & 0xffffffff; // lower 32 bits right there. 
- carry >>= 32; // retain the carry - } -} - -// h = (h + c) * r -static void poly_block(crypto_poly1305_ctx *ctx) -{ - // h + c, without carry propagation - const uint64_t h0 = ctx->h[0] + (uint64_t)ctx->c[0]; - const uint64_t h1 = ctx->h[1] + (uint64_t)ctx->c[1]; - const uint64_t h2 = ctx->h[2] + (uint64_t)ctx->c[2]; - const uint64_t h3 = ctx->h[3] + (uint64_t)ctx->c[3]; - const uint64_t h4 = ctx->h[4] + (uint64_t)ctx->c[4]; - - // Local all the things! - const uint64_t r0 = ctx->r[0]; - const uint64_t r1 = ctx->r[1]; - const uint64_t r2 = ctx->r[2]; - const uint64_t r3 = ctx->r[3]; - const uint64_t rr0 = (ctx->r[0] >> 2) * 5; // lose 2 bottom bits... - const uint64_t rr1 = (ctx->r[1] >> 2) * 5; // 2 bottom bits already cleared - const uint64_t rr2 = (ctx->r[2] >> 2) * 5; // 2 bottom bits already cleared - const uint64_t rr3 = (ctx->r[3] >> 2) * 5; // 2 bottom bits already cleared - - // (h + c) * r, without carry propagation - const uint64_t x0 = h0*r0 + h1*rr3 + h2*rr2 + h3*rr1 + h4*rr0; - const uint64_t x1 = h0*r1 + h1*r0 + h2*rr3 + h3*rr2 + h4*rr1; - const uint64_t x2 = h0*r2 + h1*r1 + h2*r0 + h3*rr3 + h4*rr2; - const uint64_t x3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*rr3; - const uint64_t x4 = h4 * (r0 & 3); // ...recover those 2 bits - - // carry propagation, put ctx->h under 2^130 - const uint64_t msb = x4 + (x3 >> 32); - uint64_t u = (msb >> 2) * 5; // lose 2 bottom bits... 
- u += (x0 & 0xffffffff) ; ctx->h[0] = u & 0xffffffff; u >>= 32; - u += (x1 & 0xffffffff) + (x0 >> 32); ctx->h[1] = u & 0xffffffff; u >>= 32; - u += (x2 & 0xffffffff) + (x1 >> 32); ctx->h[2] = u & 0xffffffff; u >>= 32; - u += (x3 & 0xffffffff) + (x2 >> 32); ctx->h[3] = u & 0xffffffff; u >>= 32; - u += msb & 3 /* ...recover them */ ; ctx->h[4] = u; -} - -// (re-)initializes the input counter and input buffer -static void poly_clear_c(crypto_poly1305_ctx *ctx) -{ - for (int i = 0; i < 4; i++) - ctx->c[i] = 0; - ctx->c_index = 0; -} - -void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]) -{ - // initial h: zero - for (int i = 0; i < 5; i++) - ctx->h [i] = 0; - // initial r: first half of the key, minus a few bits - poly_load(ctx->r, key); - ctx->r[0] &= 0x0fffffff; // clear top 4 bits - ctx->r[1] &= 0x0ffffffc; // clear top 4 & bottom 2 bits - ctx->r[2] &= 0x0ffffffc; // clear top 4 & bottom 2 bits - ctx->r[3] &= 0x0ffffffc; // clear top 4 & bottom 2 bits - ctx->c[4] = 1; - // second half of the key, saved for later - poly_load(ctx->pad, key + 16); - ctx->pad[4] = 0; - // buffer and counter - poly_clear_c(ctx); -} - -void crypto_poly1305_update(crypto_poly1305_ctx *ctx, - const uint8_t *m, size_t bytes) -{ - while (bytes > 0) { - if (ctx->c_index == 16) { - poly_block(ctx); - poly_clear_c(ctx); - } - // feed the input buffer - ctx->c[ctx->c_index / 4] |= *m << ((ctx->c_index % 4) * 8); - ctx->c_index++; - m++; - bytes--; - } -} - -void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]) -{ - // move the final 1 according to remaining input length - ctx->c[4] = 0; - ctx->c[ctx->c_index / 4] |= 1 << ((ctx->c_index % 4) * 8); - // one last hash update... - poly_block(ctx); - // ... this time with full modular reduction - // We only need to conditionally subtract 2^130-5, - // using bit twidling to prevent timing attacks. 
- static const uint32_t minus_p[5] = { 5, 0, 0, 0, 0xfffffffc }; - uint32_t h_minus_p[5]; - poly_add(h_minus_p, ctx->h, minus_p); - uint32_t negative = ~(-(h_minus_p[4] >> 31)); // 0 or -1 (2's complement) - for (int i = 0; i < 5; i++) { - ctx->h[i] ^= negative & (ctx->h[i] ^ h_minus_p[i]); - } - // Add the secret pad to the final hash before output - poly_add(ctx->h, ctx->h, ctx->pad); - for (int i = 0; i < 4; i++) - store32_le(mac + i*4, ctx->h[i]); -} - -void crypto_poly1305_auth(uint8_t mac[16], const uint8_t *m, - size_t m_size , const uint8_t key[32]) -{ - crypto_poly1305_ctx ctx; - crypto_poly1305_init (&ctx, key); - crypto_poly1305_update(&ctx, m, m_size); - crypto_poly1305_finish(&ctx, mac); -} - -int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]) -{ - unsigned diff = 0; - for (int i = 0; i < 16; i++) { - diff |= (mac1[i] ^ mac2[i]); - } - return diff; -} diff --git a/poly1305.h b/poly1305.h deleted file mode 100644 index ce3571e..0000000 --- a/poly1305.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef POLY1305_H -#define POLY1305_H - -#include -#include - -typedef struct { - uint32_t r[4]; - uint32_t h[5]; - uint32_t c[5]; - uint32_t pad[5]; - size_t c_index; -} crypto_poly1305_ctx; - - -// Initializes the poly1305 context with the secret key. -// Call first (obviously). -// WARNING: NEVER AUTHENTICATE 2 MESSAGES WITH THE SAME KEY. -// This is a ONE TIME authenticator. If you authenticate 2 messages -// with the same key, the attacker may deduce your secret key and -// authenticate messages in your stead. -void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const uint8_t key[32]); - -// Updates the poly1305 context with a chunk of the message -// Can be called multiple times, once for each chunk. -// Make sure the chunks are processed in order, without overlap or hole... -void crypto_poly1305_update(crypto_poly1305_ctx *ctx, - const uint8_t *m, size_t bytes); - -// Authenticate the message munched through previous update() calls. 
-// Call last (obviously). -void crypto_poly1305_finish(crypto_poly1305_ctx *ctx, uint8_t mac[16]); - - -// Convenience all in one function -void crypto_poly1305_auth(uint8_t mac[16], - const uint8_t *m, - size_t msg_length, - const uint8_t key[32]); - -// Constant time equality verification -// returns 0 if it matches, something else otherwise. -int crypto_memcmp_16(const uint8_t mac1[16], const uint8_t mac2[16]); - -#endif // POLY1305_H diff --git a/test.c b/test.c index 0dd7678..e7c6fa3 100644 --- a/test.c +++ b/test.c @@ -1,16 +1,7 @@ #include #include -#include -#include #include -#include "chacha20.h" -#include "blake2b.h" -#include "poly1305.h" -#include "argon2i.h" -#include "ae.h" -#include "lock.h" -#include "x25519.h" -#include "ed25519.h" +#include "monocypher.h" #include "sha512.h" ///////////////////////// diff --git a/x25519.c b/x25519.c deleted file mode 100644 index 25ac9dd..0000000 --- a/x25519.c +++ /dev/null @@ -1,156 +0,0 @@ -// Taken from TweetNaCl - -#include "x25519.h" - -#define FOR(i, start, end) for (size_t i = start; i < end; i++) -#define sv static void -typedef int64_t gf[16]; - -static const uint8_t _0[16]; -static const uint8_t _9[32] = { 9 }; -static const gf _121665 = { 0xdb41, 1 }; - -sv car_25519(gf o) -{ - FOR(i, 0, 16) { - o[i] += 1LL << 16; - int64_t c = o[i] >> 16; - o[(i+1) * (i<15)] += c - 1 + (37 * (c-1) * (i==15)); - o[i] -= c << 16; - } -} - -sv sel_25519(gf p, gf q, int b) -{ - int64_t c = ~(b-1); - FOR(i, 0, 16) { - int64_t t = c & (p[i] ^ q[i]); - p[i] ^= t; - q[i] ^= t; - } -} - -sv pack_25519(uint8_t *o, const gf n) -{ - gf t; - FOR(i, 0, 16) t[i] = n[i]; - car_25519(t); - car_25519(t); - car_25519(t); - FOR(j, 0, 2) { - gf m; - m[0] = t[0] - 0xffed; - FOR(i, 1, 15) { - m[i ] = t[i] - 0xffff - ((m[i-1] >> 16) & 1); - m[i-1] &= 0xffff; - } - m[15] = t[15] - 0x7fff - ((m[14] >> 16) & 1); - int b = (m[15] >> 16) & 1; - m[14] &= 0xffff; - sel_25519(t, m, 1-b); - } - FOR(i, 0, 16) { - o[2*i ] = t[i] & 0xff; - o[2*i + 1] 
= t[i] >> 8; - } -} - -sv unpack_25519(gf o, const uint8_t *n) -{ - FOR(i, 0, 16) o[i] = n[2*i] + ((int64_t)n[2*i + 1] << 8); - o[15] &= 0x7fff; -} - -sv A(gf o, const gf a, const gf b) -{ - FOR(i, 0, 16) o[i] = a[i] + b[i]; -} - -sv Z(gf o, const gf a, const gf b) -{ - FOR(i, 0, 16) o[i] = a[i] - b[i]; -} - -sv M(gf o, const gf a, const gf b) -{ - int64_t t[31]; - FOR(i, 0, 31) t[i] = 0; - FOR(i, 0, 16) FOR(j, 0, 16) t[i+j] += a[i] * b[j]; - FOR(i, 0, 15) t[i] += 38 * t[i+16]; - FOR(i, 0, 16) o[i] = t[i]; - car_25519(o); - car_25519(o); -} - -sv S(gf o,const gf a) -{ - M(o, a, a); -} - -sv inv_25519(gf o,const gf i) -{ - gf c; - FOR(a, 0, 16) c[a] = i[a]; - for(int a = 253; a >= 0; a--) { - S(c, c); - if(a != 2 && a != 4) - M(c, c, i); - } - FOR(a, 0, 16) o[a] = c[a]; -} - -void crypto_x25519(uint8_t q[32], const uint8_t n[32], const uint8_t p[32]) -{ - uint8_t z[32]; - int64_t x[80]; - int64_t r; - gf a, b, c, d, e, f; - FOR(i, 0, 31) z[i] = n[i]; - z[31] = (n[31] & 127) | 64; - z[0 ] &= 248; - unpack_25519(x, p); - FOR(i, 0, 16) { - b[i] = x[i]; - d[i] = a[i] = c[i] = 0; - } - a[0] = d[0] = 1; - for(int i = 254; i>=0; i--) { - r = (z[i>>3] >> (i & 7)) & 1; - sel_25519(a, b, r); - sel_25519(c, d, r); - A(e, a, c); - Z(a, a, c); - A(c, b, d); - Z(b, b, d); - S(d, e); - S(f, a); - M(a, c, a); - M(c, b, e); - A(e, a, c); - Z(a, a, c); - S(b, a); - Z(c, d, f); - M(a, c, _121665); - A(a, a, d); - M(c, c, a); - M(a, d, f); - M(d, b, x); - S(b, e); - sel_25519(a, b, r); - sel_25519(c, d, r); - } - FOR(i, 0, 16) { - x[i+16] = a[i]; - x[i+32] = c[i]; - x[i+48] = b[i]; - x[i+64] = d[i]; - } - inv_25519(x+32, x+32); - M(x+16, x+16, x+32); - pack_25519(q, x+16); -} - -void crypto_x25519_base(uint8_t q[32], const uint8_t n[32]) -{ - crypto_x25519(q, n, _9); -} diff --git a/x25519.h b/x25519.h deleted file mode 100644 index 1c5d2ac..0000000 --- a/x25519.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef X25519_H -#define X25519_H - -#include -#include - -// Computes a shared secret 
from your private key and their public key. -// WARNING: DO NOT USE THE SHARED SECRET DIRECTLY. -// The shared secret is not pseudo-random. You need to hash it to derive -// an acceptable secret key. Any cryptographic hash can work, as well as -// HChacha20. -// -// Implementation details: this is an elliptic curve. The public key is -// a point on this curve, and your private key is a scalar. The shared -// secret is another point on this curve, obtained by scalar multiplication. -// Basically: -// shared_secret == your_sk * their_pk == your_sk * (their_sk * base_point) -// == their_sk * your_pk == their_sk * (your_sk * base_point) -void crypto_x25519(uint8_t shared_secret [32], - const uint8_t your_secret_key [32], - const uint8_t their_public_key[32]); - -// Generates a public key from the specified secret key. -// Make sure the secret key is randomly selected. -// -// Implementation detail: your secret key is a scalar, and we multiply -// the base point (a constant) by it to obtain a public key. That is: -// public_key == secret_key * base_point -// Reversing the operation is conjectured to be infeasible -// without quantum computers (128 bits of security). -void crypto_x25519_base(uint8_t public_key[32], const uint8_t secret_key[32]); - - -#endif // X25519_H