From 430dd1bb200643021dffbf2683bb964534bc7f2c Mon Sep 17 00:00:00 2001
From: Loup Vaillant <loup@loup-vaillant.fr>
Date: Mon, 18 Nov 2019 22:41:41 +0100
Subject: [PATCH] Leveraged fe_pow22523 to to simplify fe_invert

The multiplication chain used in those two function is probably optimal,
but it is also kind of black magic, and takes quite a bit of code.
TweetNaCl has a much shorter, much easier to read, much slower addition
chain. I figured maybe a middle ground were possible.

Turns out it's difficult. I couldn't come up with a nice multiplication
chain on my own. But I did notice a relationship between 2^252 - 3 and
2^255 - 23 (the latter is used to invert): they start with the same bit
pattern. More specifically:

    2^255 - 23 = (2^252 - 3) * 8 + 3

I can use the same multiplication chain for both function, and just
finish the job for the inversion.

The cost of this patch compared to the ref10 multiplication chain is
five field multiplications, three of which are squaring. The effect on
the benchmark is so small that we don't even notice the difference.

The benefit is 10 meaty lines of code, and a corresponding decrease in
binary size.
---
 src/monocypher.c | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/src/monocypher.c b/src/monocypher.c
index 974332b..95e1095 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -1181,30 +1181,6 @@ static void fe_sq2(fe h, const fe f)
     fe_mul_small(h, h, 2);
 }
 
-// This could be simplified, but it would be slower
-static void fe_invert(fe out, const fe z)
-{
-    fe t0, t1, t2, t3;
-    fe_sq(t0, z );
-    fe_sq(t1, t0);
-    fe_sq(t1, t1);
-    fe_mul(t1,  z, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t2, t0);                                fe_mul(t1 , t1, t2);
-    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
-    WIPE_BUFFER(t0);
-    WIPE_BUFFER(t1);
-    WIPE_BUFFER(t2);
-    WIPE_BUFFER(t3);
-}
-
 // This could be simplified, but it would be slower
 static void fe_pow22523(fe out, const fe z)
 {
@@ -1226,6 +1202,20 @@ static void fe_pow22523(fe out, const fe z)
     WIPE_BUFFER(t2);
 }
 
+// Inverting means multiplying by 2^255 - 23
+// 2^255 - 21 = (2^252 - 3) * 8 + 3
+// So we reuse the multiplication chain of fe_pow22523
+static void fe_invert(fe out, const fe z)
+{
+    fe tmp;
+    fe_pow22523(tmp, z);
+    // tmp2^8 * z^3
+    fe_sq(tmp, tmp);                        // 0
+    fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
+    fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
+    WIPE_BUFFER(tmp);
+}
+
 static void fe_tobytes(u8 s[32], const fe h)
 {
     i32 t[10];
-- 
2.47.3