From: Loup Vaillant
Date: Mon, 20 Feb 2017 00:55:50 +0000 (+0100)
Subject: 20% slower curve25519, gained a hundred lines
X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=5c33c3fc7b4cc1da79497357c48e2d6aab084a5a;p=Monocypher.git

20% slower curve25519, gained a hundred lines
---

diff --git a/monocypher.c b/monocypher.c
index a6f4bb1..763e5d4 100644
--- a/monocypher.c
+++ b/monocypher.c
@@ -762,8 +762,10 @@ void crypto_argon2i(u8 *tag, u32 tag_size,
 }
 
 ////////////////////////////////////
-/// Arithmetic modulo 2^255 - 19 /// Taken from Supercop's ref10 implementation.
-//////////////////////////////////// A bit bigger than TweetNaCl, much faster.
+/// Arithmetic modulo 2^255 - 19 ///
+////////////////////////////////////
+// Taken from Supercop's ref10 implementation.
+// A bit bigger than TweetNaCl, over 6 times faster.
 
 // field element
 typedef i32 fe[10];
@@ -833,7 +835,7 @@ sv fe_mul121666(fe h, const fe f)
 sv fe_mul(fe h, const fe f, const fe g)
 {
     // Everything is unrolled and put in temporary variables.
-    // We could roll the loop, but that would make it twice as slow.
+    // We could roll the loop, but that would make curve25519 twice as slow.
     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
     i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
@@ -864,111 +866,41 @@ sv fe_mul(fe h, const fe f, const fe g)
     i64 h9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
            + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
 
-#define CARRY_MULT                                                  \
-    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                     \
-    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26; \
-    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26; \
-    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= c1 << 25; \
-    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= c5 << 25; \
-    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= c2 << 26; \
-    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= c6 << 26; \
-    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= c3 << 25; \
-    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= c7 << 25; \
-    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26; \
-    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= c8 << 26; \
-    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25; \
-    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26; \
-                                                                    \
-    h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4;          \
-    h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9
-    CARRY_MULT;
-}
-
-sv fe_sq(fe h, const fe f)
-{
-    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
-    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
-    i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
-    i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
-    i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
-    i32 f8_19 = f8*19; i32 f9_38 = f9*38;
-
-    i64 h0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
-           + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
-    i64 h1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
-           + f4  *(i64)f7_38 + f5_2*(i64)f6_19;
-    i64 h2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
-           + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
-    i64 h3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
-           + f5_2*(i64)f8_19 + f6  *(i64)f7_38;
-    i64 h4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
-           + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
-    i64 h5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
-           + f6  *(i64)f9_38 + f7_2*(i64)f8_19;
-    i64 h6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
-           + f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
-    i64 h7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
-           + f3_2*(i64)f4    + f8  *(i64)f9_38;
-    i64 h8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
-           + f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
-    i64 h9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
-           + f3_2*(i64)f6    + f4  *(i64)f5_2;
-    CARRY_MULT;
-}
-
-sv fe_invert(fe out, const fe z)
-{
-    /*
+    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26;
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26;
+    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= c1 << 25;
+    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= c5 << 25;
+    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= c2 << 26;
+    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= c6 << 26;
+    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= c3 << 25;
+    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= c7 << 25;
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26;
+    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= c8 << 26;
+    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25;
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26;
+
+    h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4;
+    h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9;
+}
+
+sv fe_sq(fe h, const fe f) { fe_mul(h, f, f); }
+
+// Raises z to a power of the form 2^pow_2 - minus, where minus is small.
+// Timing depends on pow_2 and minus, so they must not be secret.
+sv fe_power(fe out, const fe z, int pow_2, u8 minus)
+{
+    minus--;
     fe c; fe_copy(c, z);
-    FOR (i, 0, 254) {
+    for (int i = pow_2 - 2; i >= 0; i--) {
         fe_sq(c, c);
-        if(i !=251 && i!= 249) fe_mul(c, c, z);
-    }
-    fe_copy(out, c);
-    */
-    fe t0, t1, t2, t3;
-    fe_sq(t0, z );
-    fe_sq(t1, t0);
-    fe_sq(t1, t1);
-    fe_mul(t1, z, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t2, t0);                 fe_mul(t1 , t1, t2);
-    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
-
-}
-
-void fe_pow22523(fe out, const fe z)
-{
-    /*
-    fe c; fe_copy(c, z);
-    FOR(i, 0, 251) {
-        fe_sq (c, c);
-        if (i != 249) fe_mul(c, c, z);
+        if (i >= 8 || !((minus >> i) & 1))
+            fe_mul(c, c, z);
     }
     fe_copy(out, c);
-    */
-    fe t0, t1, t2;
-    fe_sq(t0, z);
-    fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t0, t0); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t1, t1, t0);
-    fe_sq(t2, t1); FOR (i, 1,  20) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t1, t1); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t1, t1, t0);
-    fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t1, t1); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t0, t0); FOR (i, 1,   2) fe_sq(t0, t0); fe_mul(out, t0, z);
-
 }
+sv fe_invert (fe out, const fe z) { fe_power(out, z, 255, 21); }
+sv fe_pow22523(fe out, const fe z) { fe_power(out, z, 252, 3); }
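The loop in fe_power() is plain square-and-multiply. The exponent 2^pow_2 - minus has every bit set from bit 8 up to bit pow_2 - 1, and below bit 8 its bits are the complement of minus - 1 (subtracting a small number from all-ones borrows nothing), which is exactly what the `i >= 8 || !((minus >> i) & 1)` test reads off. Here is the same walk on plain integers, as a sketch; pow_2_minus() is a hypothetical helper, not part of Monocypher:

    #include <stdint.h>

    // z^(2^pow_2 - minus) mod m, mirroring fe_power()'s bit walk.
    // Toy version: keep m under 2^31 or c * c overflows.
    static uint64_t pow_2_minus(uint64_t z, int pow_2, uint8_t minus, uint64_t m)
    {
        minus--;                // bits 0-7 of the exponent are now ~minus
        z %= m;
        uint64_t c = z;         // top bit (bit pow_2 - 1) is always set
        for (int i = pow_2 - 2; i >= 0; i--) {
            c = (c * c) % m;                    // fe_sq
            if (i >= 8 || !((minus >> i) & 1))  // bit i of the exponent is set
                c = (c * z) % m;                // fe_mul
        }
        return c;
    }

With pow_2 = 255 and minus = 21 this computes z^(2^255 - 21) = z^(p-2), which is what fe_invert() needs: the inverse modulo p = 2^255 - 19, by Fermat's little theorem.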
 
 sv fe_tobytes(u8 s[32], const fe h)
 {
@@ -1014,7 +946,8 @@ static int fe_isnonzero(const fe f)
 
 ///////////////
 /// X-25519 /// Taken from Supercop's ref10 implementation.
-/////////////// Bigger than TweetNaCl, but over 8 times faster
+///////////////
+
 sv trim_scalar(u8 s[32])
 {
     s[ 0] &= 248;
@@ -1035,14 +968,14 @@ void crypto_x25519(u8 shared_secret [32],
     trim_scalar(e);
 
     // Montgomery ladder
-    // We work in projective coordinates to avoid divisons: x = X / Z
-    // We don't care about the y coordinate.
+    // In projective coordinates, to avoid divisions: x = X / Z
+    // We don't care about the y coordinate; it's only 1 bit of information.
     fe_1(x2);        fe_0(z2); // "zero" point
     fe_copy(x3, x1); fe_1(z3); // "one"  point
-    u32 swap = 0;
+    int swap = 0;
     for (int pos = 254; pos >= 0; --pos) {
         // constant time conditional swap before ladder step
-        u32 b = (e[pos / 8] >> (pos & 7)) & 1;
+        int b = (e[pos / 8] >> (pos & 7)) & 1;
         swap ^= b; // xor trick avoids swapping at the end of the loop
         fe_cswap(x2, x3, swap);
         fe_cswap(z2, z3, swap);
@@ -1182,42 +1115,34 @@ sv ge_add(ge *s, const ge *p, const ge *q)
     fe_mul(s->T, e, h);  // T3 = E * H
 }
 
-sv ge_double(ge *s, const ge *p)
-{
-    fe a, b, c, d, e, f, g, h;
-    fe_sub(a, p->Y, p->X);  fe_sq(a, a);       // A = (Y1-X1)^2
-    fe_add(b, p->X, p->Y);  fe_sq(b, b);       // B = (Y1+X1)^2
-    fe_sq (c, p->T);        fe_mul(c, c, D2);  // C = T1^2 * k
-    fe_sq (d, p->Z);        fe_add(d, d, d);   // D = Z1^2 * 2
-    fe_sub(e, b, a);     // E = B - A
-    fe_sub(f, d, c);     // F = D - C
-    fe_add(g, d, c);     // G = D + C
-    fe_add(h, b, a);     // H = B + A
-    fe_mul(s->X, e, f);  // X3 = E * F
-    fe_mul(s->Y, g, h);  // Y3 = G * H
-    fe_mul(s->Z, f, g);  // Z3 = F * G
-    fe_mul(s->T, e, h);  // T3 = E * H
-}
+sv ge_double(ge *s, const ge *p) { ge_add(s, p, p); }
 
 sv ge_scalarmult(ge *p, const ge *q, const u8 scalar[32])
 {
+    // Simple Montgomery ladder, with straight double and add.
     ge t;
     fe_0(p->X);  fe_copy(t.X, q->X);
     fe_1(p->Y);  fe_copy(t.Y, q->Y);
     fe_1(p->Z);  fe_copy(t.Z, q->Z);
     fe_0(p->T);  fe_copy(t.T, q->T);
-
+    int swap = 0;
     for (int i = 255; i >= 0; i--) {
-        u8 b = (scalar[i/8] >> (i & 7)) & 1;
-        ge_cswap(p, &t, b);
+        int b = (scalar[i/8] >> (i & 7)) & 1;
+        swap ^= b; // xor trick avoids unnecessary swaps
+        ge_cswap(p, &t, swap);
+        swap = b;
         ge_add(&t, &t, p);
        ge_double(p, p);
-        ge_cswap(p, &t, b);
     }
+    // one last swap makes up for the xor trick
+    ge_cswap(p, &t, swap);
 }
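fe_cswap() and ge_cswap() are what make these ladders constant time: the swap bit becomes an all-zeros or all-ones mask, so the exact same loads, xors, and stores execute whether or not the two halves actually swap. A minimal sketch of the idea, with a hypothetical cond_swap() over 32-bit limbs (not Monocypher's actual code):

    #include <stdint.h>

    // b must be 0 or 1.  No branch or memory index depends on it.
    static void cond_swap(uint32_t x[10], uint32_t y[10], uint32_t b)
    {
        uint32_t mask = ~(b - 1);  // b == 1 -> all ones, b == 0 -> zero
        for (int i = 0; i < 10; i++) {
            uint32_t t = mask & (x[i] ^ y[i]);
            x[i] ^= t;             // swapped when the mask is all ones,
            y[i] ^= t;             // left alone when it is zero
        }
    }

The xor trick is separate bookkeeping on top of that: instead of swapping back after every ladder step, `swap ^= b` remembers whether the next step needs its operands exchanged, so each iteration pays for one conditional swap instead of two.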
 
 sv ge_scalarmult_base(ge *p, const u8 scalar[32])
 {
+    // Calls the general ge_scalarmult() with the base point.
+    // Other implementations use a precomputed table, but it
+    // takes way too much code.
     static const fe X = {
         0x325d51a, 0x18b5823, 0x0f6592a, 0x104a92d, 0x1a4b31d,
         0x1d6dc5c, 0x27118fe, 0x07fd814, 0x13cd6e5, 0x085a4db};
@@ -1235,15 +1160,15 @@ sv modL(u8 *r, i64 x[64])
         0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 };
-    i64 i, j;
-    for (i = 63;i >= 32;--i) {
+    unsigned i;
+    for (i = 63; i >= 32; i--) {
         i64 carry = 0;
-        for (j = i - 32;j < i - 12;++j) {
+        FOR (j, i-32, i-12) {
             x[j] += carry - 16 * x[i] * L[j - (i - 32)];
             carry = (x[j] + 128) >> 8;
             x[j] -= carry << 8;
         }
-        x[j] += carry;
+        x[i-12] += carry;
         x[i] = 0;
     }
     i64 carry = 0;
@@ -1252,7 +1177,7 @@ sv modL(u8 *r, i64 x[64])
         carry = x[j] >> 8;
         x[j] &= 255;
     }
-    FOR(j, 0, 32) x[j] -= carry * L[j];
+    FOR(j, 0, 32) { x[j] -= carry * L[j]; }
     FOR(i, 0, 32) {
         x[i+1] += x[i] >> 8;
         r[i  ]  = x[i] & 255;
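The change from `x[j] += carry` to `x[i-12] += carry` in modL() is not cosmetic. The old explicit for loop left j visible after it ended; the FOR macro declares its index inside the loop, something along these lines (assumed here; the real definition lives near the top of monocypher.c):

    #define FOR(i, start, end) for (size_t i = (start); i < (end); i++)

After `FOR (j, i-32, i-12) { ... }` the index is out of scope, and since j would have ended at i - 12, the trailing carry write has to name that index directly.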
@@ -1317,10 +1242,11 @@ void crypto_ed25519_sign(uint8_t signature[64],
     HASH_UPDATE(&ctx, message, message_size);
     HASH_FINAL (&ctx, r);
 
+    // first half of the signature = "random" nonce times basepoint
     ge R;
     reduce(r);
     ge_scalarmult_base(&R, r);
-    ge_tobytes(signature, &R); // first half of the signature = "random" nonce
+    ge_tobytes(signature, &R);
 
     u8 h_ram[64];
     hash_ram(h_ram, signature, public_key, message, message_size);
diff --git a/more_speed.c b/more_speed.c
new file mode 100644
index 0000000..9afd728
--- /dev/null
+++ b/more_speed.c
@@ -0,0 +1,118 @@
+// Some low-hanging fruit to make Monocypher a bit faster.
+//
+// It's a nice-to-have, but it probably won't save you if Monocypher
+// is really too slow for you.  If that ever happens, consider
+// switching to a 64-bit implementation such as Donna, or even to
+// direct assembly.
+//
+// Using Donna's field arithmetic should yield at least a 5-fold
+// speedup.  Signing can be made faster still, but that takes a lot
+// of code.
+//
+// On my computer, the alternate routines below make curve25519 about
+// 20% faster.  Noticeable, but not worth the extra hundred lines.
+
+
+// Specialised squaring function, faster than general multiplication.
+sv fe_sq(fe h, const fe f)
+{
+    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
+    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
+    i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
+    i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
+    i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
+    i32 f8_19 = f8*19; i32 f9_38 = f9*38;
+
+    i64 h0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
+           + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
+    i64 h1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
+           + f4  *(i64)f7_38 + f5_2*(i64)f6_19;
+    i64 h2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
+           + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
+    i64 h3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
+           + f5_2*(i64)f8_19 + f6  *(i64)f7_38;
+    i64 h4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
+           + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
+    i64 h5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
+           + f6  *(i64)f9_38 + f7_2*(i64)f8_19;
+    i64 h6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
+           + f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
+    i64 h7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
+           + f3_2*(i64)f4    + f8  *(i64)f9_38;
+    i64 h8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
+           + f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
+    i64 h9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
+           + f3_2*(i64)f6    + f4  *(i64)f5_2;
+
+    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26;
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26;
+    c1 = (h1 + (i64) (1<<24)) >> 25; h2 += c1;      h1 -= c1 << 25;
+    c5 = (h5 + (i64) (1<<24)) >> 25; h6 += c5;      h5 -= c5 << 25;
+    c2 = (h2 + (i64) (1<<25)) >> 26; h3 += c2;      h2 -= c2 << 26;
+    c6 = (h6 + (i64) (1<<25)) >> 26; h7 += c6;      h6 -= c6 << 26;
+    c3 = (h3 + (i64) (1<<24)) >> 25; h4 += c3;      h3 -= c3 << 25;
+    c7 = (h7 + (i64) (1<<24)) >> 25; h8 += c7;      h7 -= c7 << 25;
+    c4 = (h4 + (i64) (1<<25)) >> 26; h5 += c4;      h4 -= c4 << 26;
+    c8 = (h8 + (i64) (1<<25)) >> 26; h9 += c8;      h8 -= c8 << 26;
+    c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 << 25;
+    c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 << 26;
+
+    h[0] = h0; h[1] = h1; h[2] = h2; h[3] = h3; h[4] = h4;
+    h[5] = h5; h[6] = h6; h[7] = h7; h[8] = h8; h[9] = h9;
+}
+
+// Slightly faster inversion modulo 2^255 - 19.
+sv fe_invert(fe out, const fe z)
+{
+    fe t0, t1, t2, t3;
+    fe_sq(t0, z );
+    fe_sq(t1, t0);
+    fe_sq(t1, t1);
+    fe_mul(t1, z, t1);
+    fe_mul(t0, t0, t1);
+    fe_sq(t2, t0);                 fe_mul(t1 , t1, t2);
+    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
+    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
+    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
+    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
+    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
+}
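Both inversion routines are Fermat inversions, so they are easy to sanity check: x times x^-1 must encode to the same bytes as 1. A throwaway test along these lines (hypothetical invert_ok() helper, reusing fe_1(), fe_mul(), and fe_tobytes() from monocypher.c):

    #include <string.h>

    static int invert_ok(const fe x)  // 1 when fe_invert() behaves
    {
        fe inv, prod, one;
        u8 a[32], b[32];
        fe_invert (inv, x);
        fe_mul    (prod, x, inv);
        fe_1      (one);
        fe_tobytes(a, prod);          // canonical encoding, safe to memcmp
        fe_tobytes(b, one);
        return memcmp(a, b, 32) == 0;
    }

(x = 0 has no inverse and fails the test, as it should.)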
+
+// Slightly faster power to 2^252 - 3, for ed25519 decompression.
+void fe_pow22523(fe out, const fe z)
+{
+    fe t0, t1, t2;
+    fe_sq(t0, z);
+    fe_sq(t1, t0); fe_sq(t1, t1); fe_mul(t1, z, t1);
+    fe_mul(t0, t0, t1);
+    fe_sq(t0, t0); fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t1, t1, t0);
+    fe_sq(t2, t1); FOR (i, 1,  20) fe_sq(t2, t2); fe_mul(t1, t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,  10) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t1, t0); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t1, t1, t0);
+    fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1);
+    fe_sq(t1, t1); FOR (i, 1,  50) fe_sq(t1, t1); fe_mul(t0, t1, t0);
+    fe_sq(t0, t0); FOR (i, 1,   2) fe_sq(t0, t0); fe_mul(out, t0, z);
+}
+
+// Slightly faster twisted Edwards point doubling, assuming we use a
+// specialised squaring function such as the one above.
+sv ge_double(ge *s, const ge *p)
+{
+    fe a, b, c, d, e, f, g, h;
+    fe_sub(a, p->Y, p->X);  fe_sq(a, a);       // A = (Y1-X1)^2
+    fe_add(b, p->X, p->Y);  fe_sq(b, b);       // B = (Y1+X1)^2
+    fe_sq (c, p->T);        fe_mul(c, c, D2);  // C = T1^2 * k
+    fe_sq (d, p->Z);        fe_add(d, d, d);   // D = Z1^2 * 2
+    fe_sub(e, b, a);     // E = B - A
+    fe_sub(f, d, c);     // F = D - C
+    fe_add(g, d, c);     // G = D + C
+    fe_add(h, b, a);     // H = B + A
+    fe_mul(s->X, e, f);  // X3 = E * F
+    fe_mul(s->Y, g, h);  // Y3 = G * H
+    fe_mul(s->Z, f, g);  // Z3 = F * G
+    fe_mul(s->T, e, h);  // T3 = E * H
+}
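For what it's worth, the 20% figure is easy to reproduce with a crude harness like the one below. It is a sketch: it assumes the void crypto_x25519(shared_secret, your_secret_key, their_public_key) signature shown in the hunk above, and clock() is plenty precise for spotting a 20% difference.

    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include "monocypher.h"

    int main(void)
    {
        uint8_t sk[32] = {1};   // toy secret key
        uint8_t pk[32] = {9};   // u = 9 is the curve25519 base point
        uint8_t shared[32];
        clock_t start = clock();
        for (int i = 0; i < 1000; i++) {
            crypto_x25519(shared, sk, pk);
            memcpy(sk, shared, 32);  // chain the results so the
            memcpy(pk, shared, 32);  // compiler can't skip the work
        }
        double seconds = (double) (clock() - start) / CLOCKS_PER_SEC;
        // 1000 calls: total seconds == milliseconds per call
        printf("%g ms per x25519\n", seconds);
        return 0;
    }

Run it once as-is and once with the routines from more_speed.c swapped back into monocypher.c, and compare.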