From 430dd1bb200643021dffbf2683bb964534bc7f2c Mon Sep 17 00:00:00 2001 From: Loup Vaillant Date: Mon, 18 Nov 2019 22:41:41 +0100 Subject: [PATCH] Leveraged fe_pow22523 to to simplify fe_invert The multiplication chain used in those two function is probably optimal, but it is also kind of black magic, and takes quite a bit of code. TweetNaCl has a much shorter, much easier to read, much slower addition chain. I figured maybe a middle ground were possible. Turns out it's difficult. I couldn't come up with a nice multiplication chain on my own. But I did notice a relationship between 2^252 - 3 and 2^255 - 23 (the latter is used to invert): they start with the same bit pattern. More specifically: 2^255 - 23 = (2^252 - 3) * 8 + 3 I can use the same multiplication chain for both function, and just finish the job for the inversion. The cost of this patch compared to the ref10 multiplication chain is five field multiplications, three of which are squaring. The effect on the benchmark is so small that we don't even notice the difference. The benefit is 10 meaty lines of code, and a corresponding decrease in binary size. --- src/monocypher.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/src/monocypher.c b/src/monocypher.c index 974332b..95e1095 100644 --- a/src/monocypher.c +++ b/src/monocypher.c @@ -1181,30 +1181,6 @@ static void fe_sq2(fe h, const fe f) fe_mul_small(h, h, 2); } -// This could be simplified, but it would be slower -static void fe_invert(fe out, const fe z) -{ - fe t0, t1, t2, t3; - fe_sq(t0, z ); - fe_sq(t1, t0); - fe_sq(t1, t1); - fe_mul(t1, z, t1); - fe_mul(t0, t0, t1); - fe_sq(t2, t0); fe_mul(t1 , t1, t2); - fe_sq(t2, t1); FOR (i, 1, 5) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 20) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 10) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t2, t1); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t2 , t2, t1); - fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2); - fe_sq(t2, t2); FOR (i, 1, 50) fe_sq(t2, t2); fe_mul(t1 , t2, t1); - fe_sq(t1, t1); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(out, t1, t0); - WIPE_BUFFER(t0); - WIPE_BUFFER(t1); - WIPE_BUFFER(t2); - WIPE_BUFFER(t3); -} - // This could be simplified, but it would be slower static void fe_pow22523(fe out, const fe z) { @@ -1226,6 +1202,20 @@ static void fe_pow22523(fe out, const fe z) WIPE_BUFFER(t2); } +// Inverting means multiplying by 2^255 - 23 +// 2^255 - 21 = (2^252 - 3) * 8 + 3 +// So we reuse the multiplication chain of fe_pow22523 +static void fe_invert(fe out, const fe z) +{ + fe tmp; + fe_pow22523(tmp, z); + // tmp2^8 * z^3 + fe_sq(tmp, tmp); // 0 + fe_sq(tmp, tmp); fe_mul(tmp, tmp, z); // 1 + fe_sq(tmp, tmp); fe_mul(out, tmp, z); // 1 + WIPE_BUFFER(tmp); +} + static void fe_tobytes(u8 s[32], const fe h) { i32 t[10]; -- 2.47.3