From d5d9c79b4c5c4085478357cbf2a8613f837cc436 Mon Sep 17 00:00:00 2001
From: Loup Vaillant
Date: Sun, 8 Nov 2020 22:41:58 +0100
Subject: [PATCH] Documented 2^255-19 carry propagation

Fixes #185

Carry propagation is now justified, in a way that I can personally
vouch for (I used to rely on SUPERCOP's ref10 code and proofs).

The use of arithmetic right shifts is also documented, and a workaround
has been devised in case someone somewhere uses a platform that does
not perform sign extension. (That will never happen.)
---
 src/monocypher.c | 266 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 230 insertions(+), 36 deletions(-)

diff --git a/src/monocypher.c b/src/monocypher.c
index b9fc5b2..9c8da66 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -1037,7 +1037,7 @@ void crypto_argon2i(u8 *hash, u32 hash_size,
 ////////////////////////////////////
 /// Arithmetic modulo 2^255 - 19 ///
 ////////////////////////////////////
-// Taken from SUPERCOP's ref10 implementation.
+// Originally taken from SUPERCOP's ref10 implementation.
 // A bit bigger than TweetNaCl, over 4 times faster.
 
 // field element
@@ -1092,52 +1092,182 @@ static void fe_ccopy(fe f, const fe g, int b)
     }
 }
 
+
+// Signed carry propagation
+// ------------------------
+//
+// Let t be an integer. It can be uniquely decomposed thus:
+//
+//     t = h*2^26 + l
+//     such that -2^25 <= l < 2^25
+//
+// Let c = (t + 2^25) / 2^26           (rounded down)
+//     c = (h*2^26 + l + 2^25) / 2^26  (rounded down)
+//     c = h + (l + 2^25) / 2^26       (rounded down)
+//     c = h                           (exactly)
+// Because 0 <= l + 2^25 < 2^26
+//
+// Let u = t - c*2^26
+//     u = h*2^26 + l - h*2^26
+//     u = l
+// Therefore, -2^25 <= u < 2^25
+//
+// Additionally, if |t| < x, then |h| <= x/2^26 (rounded up)
+//
+// Notations:
+// - In C, 1<<25 means 2^25.
+// - In C, x>>25 means floor(x / (2^25)), provided the shift is
+//   arithmetic (see the note below).
+// - All of the above applies with 25 & 24 as well as 26 & 25.
+//
+//
+// Note on negative right shifts
+// -----------------------------
+//
+// In C, x >> n, where x is a negative integer, is implementation-
+// defined. In practice, all platforms do arithmetic shift, which is
+// equivalent to division by 2^n, rounded down. Some compilers, like
+// GCC, even guarantee it.
+//
+// If we ever stumble upon a platform that does not propagate the sign
+// bit (we won't), visible failures will show up at the slightest test,
+// and the signed shifts can be replaced by the following:
+//
+//     typedef struct { i64 x:39; } s25;
+//     typedef struct { i64 x:38; } s26;
+//     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
+//     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
+//
+// Current compilers cannot optimise this, causing a 30% drop in
+// performance. Fairly expensive for something that never happens.
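+//
+// Should this workaround ever be needed, a quick sanity check is to
+// compare it against the arithmetic shift it replaces. A hypothetical
+// test (not part of this patch) could assert, for instance:
+//
+//     assert(shift26(-((i64)1 << 26))    == -1); // -2^26 / 2^26, rounded down
+//     assert(shift26(((i64)1 << 26) - 1) ==  0); // (2^26 - 1) / 2^26
+//     assert(shift25(-1)                 == -1); // sign bit is propagated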
+//
+//
+// Precondition
+// ------------
+//
+//   |t0|       < 2^63
+//   |t1|..|t9| < 2^62
+//
+// Algorithm
+// ---------
+//   c   = (t0 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t0 -= c * 2^26            -- |t0| <= 2^25
+//   t1 += c                   -- |t1| <= 2^63
+//
+//   c   = (t4 + 2^25) / 2^26  -- |c|  <= 2^36
+//   t4 -= c * 2^26            -- |t4| <= 2^25
+//   t5 += c                   -- |t5| <= 2^63
+//
+//   c   = (t1 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t1 -= c * 2^25            -- |t1| <= 2^24
+//   t2 += c                   -- |t2| <= 2^63
+//
+//   c   = (t5 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t5 -= c * 2^25            -- |t5| <= 2^24
+//   t6 += c                   -- |t6| <= 2^63
+//
+//   c   = (t2 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t2 -= c * 2^26            -- |t2| <= 2^25  <  1.1 * 2^25 (final t2)
+//   t3 += c                   -- |t3| <= 2^63
+//
+//   c   = (t6 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t6 -= c * 2^26            -- |t6| <= 2^25  <  1.1 * 2^25 (final t6)
+//   t7 += c                   -- |t7| <= 2^63
+//
+//   c   = (t3 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t3 -= c * 2^25            -- |t3| <= 2^24  <  1.1 * 2^24 (final t3)
+//   t4 += c                   -- |t4| <= 2^25 + 2^38  <  2^39
+//
+//   c   = (t7 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t7 -= c * 2^25            -- |t7| <= 2^24  <  1.1 * 2^24 (final t7)
+//   t8 += c                   -- |t8| <= 2^63
+//
+//   c   = (t4 + 2^25) / 2^26  -- |c|  <= 2^13
+//   t4 -= c * 2^26            -- |t4| <= 2^25  <  1.1 * 2^25 (final t4)
+//   t5 += c                   -- |t5| <= 2^24 + 2^13  <  1.1 * 2^24 (final t5)
+//
+//   c   = (t8 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t8 -= c * 2^26            -- |t8| <= 2^25  <  1.1 * 2^25 (final t8)
+//   t9 += c                   -- |t9| <= 2^63
+//
+//   c   = (t9 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t9 -= c * 2^25            -- |t9| <= 2^24  <  1.1 * 2^24 (final t9)
+//   t0 += c * 19              -- |t0| <= 2^25 + 2^38*19  <  2^44
+//
+//   c   = (t0 + 2^25) / 2^26  -- |c|  <= 2^18
+//   t0 -= c * 2^26            -- |t0| <= 2^25  <  1.1 * 2^25 (final t0)
+//   t1 += c                   -- |t1| <= 2^24 + 2^18  <  1.1 * 2^24 (final t1)
+//
+// Postcondition
+// -------------
+//   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
+//   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
 #define FE_CARRY                                                          \
-    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                           \
-    c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26);  \
-    c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26);  \
-    c1 = (t1 + ((i64)1<<24)) >> 25; t2 += c1; t1 -= c1 * ((i64)1 << 25);  \
-    c5 = (t5 + ((i64)1<<24)) >> 25; t6 += c5; t5 -= c5 * ((i64)1 << 25);  \
-    c2 = (t2 + ((i64)1<<25)) >> 26; t3 += c2; t2 -= c2 * ((i64)1 << 26);  \
-    c6 = (t6 + ((i64)1<<25)) >> 26; t7 += c6; t6 -= c6 * ((i64)1 << 26);  \
-    c3 = (t3 + ((i64)1<<24)) >> 25; t4 += c3; t3 -= c3 * ((i64)1 << 25);  \
-    c7 = (t7 + ((i64)1<<24)) >> 25; t8 += c7; t7 -= c7 * ((i64)1 << 25);  \
-    c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26);  \
-    c8 = (t8 + ((i64)1<<25)) >> 26; t9 += c8; t8 -= c8 * ((i64)1 << 26);  \
-    c9 = (t9 + ((i64)1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * ((i64)1 << 25); \
-    c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26);  \
+    i64 c;                                                                \
+    c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c;     \
+    c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c;     \
+    c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c;     \
+    c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c;     \
+    c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c;     \
+    c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c;     \
+    c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c;     \
+    c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c;     \
+    c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c;     \
+    c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c;     \
+    c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
+    c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c;     \
     h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
     h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
 
 static void fe_frombytes(fe h, const u8 s[32])
 {
-    i64 t0 = load32_le(s);
-    i64 t1 = load24_le(s + 4) << 6;
-    i64 t2 = load24_le(s + 7) << 5;
-    i64 t3 = load24_le(s + 10) << 3;
-    i64 t4 = load24_le(s + 13) << 2;
-    i64 t5 = load32_le(s + 16);
-    i64 t6 = load24_le(s + 20) << 7;
-    i64 t7 = load24_le(s + 23) << 5;
-    i64 t8 = load24_le(s + 26) << 4;
-    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
-    FE_CARRY;
-}
-
+    i64 t0 = load32_le(s);                        // t0 < 2^32
+    i64 t1 = load24_le(s + 4)  << 6;              // t1 < 2^30
+    i64 t2 = load24_le(s + 7)  << 5;              // t2 < 2^29
+    i64 t3 = load24_le(s + 10) << 3;              // t3 < 2^27
+    i64 t4 = load24_le(s + 13) << 2;              // t4 < 2^26
+    i64 t5 = load32_le(s + 16);                   // t5 < 2^32
+    i64 t6 = load24_le(s + 20) << 7;              // t6 < 2^31
+    i64 t7 = load24_le(s + 23) << 5;              // t7 < 2^29
+    i64 t8 = load24_le(s + 26) << 4;              // t8 < 2^28
+    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2; // t9 < 2^25
+    FE_CARRY;                                     // Carry precondition OK
+}
+
+// Precondition
+// ------------
+//   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
+//   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
+//
+// Therefore, |h| < 2^255 - 19
+// There are two possibilities:
+//
+// - If h is positive, all we need to do is reduce its individual
+//   limbs down to their tight positive range.
+// - If h is negative, we also need to add 2^255-19 to it,
+//   or just remove 19 and chop off any excess bit.
 static void fe_tobytes(u8 s[32], const fe h)
 {
     i32 t[10];
     COPY(t, h, 10);
     i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
+    //                 |t9|                    < 1.1 * 2^24
+    //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
+    //  -21  * 2^24  <  19 * t9                < 21  * 2^24
+    //  -2^29        <  19 * t9 + 2^24         < 2^29
+    //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
+    //  -16          < (19 * t9 + 2^24) / 2^25 < 16
     FOR (i, 0, 5) {
-        q += t[2*i  ]; q >>= 26;
-        q += t[2*i+1]; q >>= 25;
+        q += t[2*i  ]; q >>= 26; // q = 0 or -1
+        q += t[2*i+1]; q >>= 25; // q = 0 or -1
     }
+    // q =  0 iff h >= 0
+    // q = -1 iff h <  0
+    // Adding q * 19 to h reduces h to its proper range.
     q *= 19;
     // Shift carry back to the beginning
     FOR (i, 0, 5) {
         t[i*2  ] += q; q = t[i*2  ] >> 26; t[i*2  ] -= q * ((i32)1 << 26);
         t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
     }
+    // h is now fully reduced, and q represents the excess bit.
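+    // Pack the 10 reduced limbs into 32 little-endian bytes. Limb i
+    // starts at bit 26*ceil(i/2) + 25*floor(i/2), i.e. at bits 0, 26,
+    // 51, 77, 102, 128, 153, 179, 204, and 230. Since h is now below
+    // 2^255, bit 255 ends up zero.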
     store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
     store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
@@ -1151,7 +1281,13 @@ static void fe_tobytes(u8 s[32], const fe h)
     WIPE_BUFFER(t);
 }
 
-// multiply a field element by a signed 32-bit integer
+// Multiply a field element by a signed 32-bit integer.
+//
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+//   |g| <= 2^31  (g is a signed 32-bit integer)
 static void fe_mul_small(fe h, const fe f, i32 g)
 {
     i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
@@ -1159,9 +1295,19 @@ static void fe_mul_small(fe h, const fe f, i32 g)
     i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
     i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
     i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
-    FE_CARRY;
+    // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  <  2^58
+    // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  <  2^57
+
+    FE_CARRY; // Carry precondition OK
 }
 
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
+//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
 static void fe_mul(fe h, const fe f, const fe g)
 {
     // Everything is unrolled and put in temporary variables.
@@ -1174,6 +1320,9 @@ static void fe_mul(fe h, const fe f, const fe g)
     i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
     i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
     i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
+    // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
+    // |G2|, |G4|, |G6|, |G8|        <  2^31
+    // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
     i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
            + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
@@ -1195,11 +1344,26 @@ static void fe_mul(fe h, const fe f, const fe g)
           + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
     i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
            + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
-
-    FE_CARRY;
-}
-
-// we could use fe_mul() for this, but this is significantly faster
+    // t0 < 0.67 * 2^61
+    // t1 < 0.41 * 2^61
+    // t2 < 0.52 * 2^61
+    // t3 < 0.32 * 2^61
+    // t4 < 0.38 * 2^61
+    // t5 < 0.22 * 2^61
+    // t6 < 0.23 * 2^61
+    // t7 < 0.13 * 2^61
+    // t8 < 0.09 * 2^61
+    // t9 < 0.03 * 2^61
+
+    FE_CARRY; // Everything is below 2^62; carry precondition OK
+}
+
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+// Note: we could use fe_mul() for this, but this is significantly faster.
 static void fe_sq(fe h, const fe f)
 {
     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
@@ -1208,6 +1372,9 @@ static void fe_sq(fe h, const fe f)
     i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2 = f6*2;  i32 f7_2 = f7*2;
     i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
     i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
+    // |f0_2| , |f2_2| , |f4_2| , |f6_2|            <  1.65 * 2^27
+    // |f1_2| , |f3_2| , |f5_2| , |f7_2|            <  1.65 * 2^26
+    // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38|  <  2^31
     i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
            + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
@@ -1229,11 +1396,38 @@ static void fe_sq(fe h, const fe f)
           + f3_2*(i64)f5_2 + f4  *(i64)f4 + f9  *(i64)f9_38;
     i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + f3_2*(i64)f6
            + f4  *(i64)f5_2;
+    // t0 < 0.67 * 2^61
+    // t1 < 0.41 * 2^61
+    // t2 < 0.52 * 2^61
+    // t3 < 0.32 * 2^61
+    // t4 < 0.38 * 2^61
+    // t5 < 0.22 * 2^61
+    // t6 < 0.23 * 2^61
+    // t7 < 0.13 * 2^61
+    // t8 < 0.09 * 2^61
+    // t9 < 0.03 * 2^61
 
     FE_CARRY;
 }
 
 // h = 2 * (f^2)
+//
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
+// each limb by 2, *then* performing the carry. This saves one carry.
+// However, doing so with the stated preconditions does not work (t2
+// would overflow). There are 3 ways to solve this:
+//
+// 1. Show that t2 actually never overflows (it really does not).
+// 2. Accept an additional carry, at a small loss of performance.
+// 3. Make sure the input of fe_sq2() is freshly carried.
+//
+// SUPERCOP ref10 relies on (1).
+// Monocypher chose (2) and (3), mostly to save code.
 static void fe_sq2(fe h, const fe f)
 {
     fe_sq(h, f);
-- 
2.47.3