From d5d9c79b4c5c4085478357cbf2a8613f837cc436 Mon Sep 17 00:00:00 2001
From: Loup Vaillant
Date: Sun, 8 Nov 2020 22:41:58 +0100
Subject: [PATCH] Documented 2^255-19 carry propagation

Fixes #185

Carry propagation is now justified, in a way that I can personally
vouch for (I used to rely on SUPERCOP's ref10 code and proofs).

The use of arithmetic right shifts is also documented, and a workaround
has been devised in case someone somewhere uses a platform that does
not perform sign extension. (That will never happen.)
---
 src/monocypher.c | 266 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 230 insertions(+), 36 deletions(-)

diff --git a/src/monocypher.c b/src/monocypher.c
index b9fc5b2..9c8da66 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -1037,7 +1037,7 @@ void crypto_argon2i(u8 *hash, u32 hash_size,
 ////////////////////////////////////
 /// Arithmetic modulo 2^255 - 19 ///
 ////////////////////////////////////
-// Taken from SUPERCOP's ref10 implementation.
+// Originally taken from SUPERCOP's ref10 implementation.
 // A bit bigger than TweetNaCl, over 4 times faster.
 
 // field element
@@ -1092,52 +1092,182 @@ static void fe_ccopy(fe f, const fe g, int b)
     }
 }
 
+
+// Signed carry propagation
+// ------------------------
+//
+// Let t be an integer. It can be uniquely decomposed thus:
+//
+//     t = h*2^26 + l
+//     such that -2^25 <= l < 2^25
+//
+// Let c = (t + 2^25) / 2^26           (rounded down)
+//     c = (h*2^26 + l + 2^25) / 2^26  (rounded down)
+//     c = h + (l + 2^25) / 2^26       (rounded down)
+//     c = h                           (exactly)
+// Because 0 <= l + 2^25 < 2^26
+//
+// Let u = t - c*2^26
+//     u = h*2^26 + l - h*2^26
+//     u = l
+// Therefore, -2^25 <= u < 2^25
+//
+// Additionally, if |t| < x, then |h| <= x/2^26 (rounded up)
+//
+// Notations:
+// - In C, 1<<25 means 2^25.
+// - In C, x>>25 means floor(x / (2^25)), provided the shift is
+//   arithmetic (see the note below).
+// - All of the above applies with 25 & 24 as well as 26 & 25.
+//
+//
+// Note on negative right shifts
+// -----------------------------
+//
+// In C, x >> n, where x is a negative integer, is implementation-
+// defined. In practice, all platforms do arithmetic shift, which is
+// equivalent to division by 2^n, rounded down. Some compilers, like
+// GCC, even guarantee it.
+//
+// If we ever stumble upon a platform that does not propagate the sign
+// bit (we won't), visible failures will show up at the slightest test,
+// and the signed shifts can be replaced by the following:
+//
+//     typedef struct { i64 x:39; } s25;
+//     typedef struct { i64 x:38; } s26;
+//     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
+//     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
+//
+// Current compilers cannot optimise this, causing a 30% drop in
+// performance. Fairly expensive for something that never happens.
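+//
+// Should this workaround ever be needed, a quick sanity check is to
+// compare it against the arithmetic shift it replaces. A hypothetical
+// test (not part of this patch) could assert, for instance:
+//
+//     assert(shift26(-((i64)1 << 26))    == -1); // -2^26 / 2^26, rounded down
+//     assert(shift26(((i64)1 << 26) - 1) ==  0); // (2^26 - 1) / 2^26
+//     assert(shift25(-1)                 == -1); // sign bit is propagated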
+//
+//
+// Precondition
+// ------------
+//
+//   |t0|       < 2^63
+//   |t1|..|t9| < 2^62
+//
+// Algorithm
+// ---------
+//   c   = (t0 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t0 -= c * 2^26            -- |t0| <= 2^25
+//   t1 += c                   -- |t1| <= 2^63
+//
+//   c   = (t4 + 2^25) / 2^26  -- |c|  <= 2^36
+//   t4 -= c * 2^26            -- |t4| <= 2^25
+//   t5 += c                   -- |t5| <= 2^63
+//
+//   c   = (t1 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t1 -= c * 2^25            -- |t1| <= 2^24
+//   t2 += c                   -- |t2| <= 2^63
+//
+//   c   = (t5 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t5 -= c * 2^25            -- |t5| <= 2^24
+//   t6 += c                   -- |t6| <= 2^63
+//
+//   c   = (t2 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t2 -= c * 2^26            -- |t2| <= 2^25  <  1.1 * 2^25 (final t2)
+//   t3 += c                   -- |t3| <= 2^63
+//
+//   c   = (t6 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t6 -= c * 2^26            -- |t6| <= 2^25  <  1.1 * 2^25 (final t6)
+//   t7 += c                   -- |t7| <= 2^63
+//
+//   c   = (t3 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t3 -= c * 2^25            -- |t3| <= 2^24  <  1.1 * 2^24 (final t3)
+//   t4 += c                   -- |t4| <= 2^25 + 2^38  <  2^39
+//
+//   c   = (t7 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t7 -= c * 2^25            -- |t7| <= 2^24  <  1.1 * 2^24 (final t7)
+//   t8 += c                   -- |t8| <= 2^63
+//
+//   c   = (t4 + 2^25) / 2^26  -- |c|  <= 2^13
+//   t4 -= c * 2^26            -- |t4| <= 2^25  <  1.1 * 2^25 (final t4)
+//   t5 += c                   -- |t5| <= 2^24 + 2^13  <  1.1 * 2^24 (final t5)
+//
+//   c   = (t8 + 2^25) / 2^26  -- |c|  <= 2^37
+//   t8 -= c * 2^26            -- |t8| <= 2^25  <  1.1 * 2^25 (final t8)
+//   t9 += c                   -- |t9| <= 2^63
+//
+//   c   = (t9 + 2^24) / 2^25  -- |c|  <= 2^38
+//   t9 -= c * 2^25            -- |t9| <= 2^24  <  1.1 * 2^24 (final t9)
+//   t0 += c * 19              -- |t0| <= 2^25 + 2^38*19  <  2^44
+//
+//   c   = (t0 + 2^25) / 2^26  -- |c|  <= 2^18
+//   t0 -= c * 2^26            -- |t0| <= 2^25  <  1.1 * 2^25 (final t0)
+//   t1 += c                   -- |t1| <= 2^24 + 2^18  <  1.1 * 2^24 (final t1)
+//
+// Postcondition
+// -------------
+//   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
+//   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
 #define FE_CARRY                                                          \
-    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                           \
-    c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26);  \
-    c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26);  \
-    c1 = (t1 + ((i64)1<<24)) >> 25; t2 += c1; t1 -= c1 * ((i64)1 << 25);  \
-    c5 = (t5 + ((i64)1<<24)) >> 25; t6 += c5; t5 -= c5 * ((i64)1 << 25);  \
-    c2 = (t2 + ((i64)1<<25)) >> 26; t3 += c2; t2 -= c2 * ((i64)1 << 26);  \
-    c6 = (t6 + ((i64)1<<25)) >> 26; t7 += c6; t6 -= c6 * ((i64)1 << 26);  \
-    c3 = (t3 + ((i64)1<<24)) >> 25; t4 += c3; t3 -= c3 * ((i64)1 << 25);  \
-    c7 = (t7 + ((i64)1<<24)) >> 25; t8 += c7; t7 -= c7 * ((i64)1 << 25);  \
-    c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26);  \
-    c8 = (t8 + ((i64)1<<25)) >> 26; t9 += c8; t8 -= c8 * ((i64)1 << 26);  \
-    c9 = (t9 + ((i64)1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * ((i64)1 << 25); \
-    c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26);  \
+    i64 c;                                                                \
+    c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c;     \
+    c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c;     \
+    c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c;     \
+    c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c;     \
+    c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c;     \
+    c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c;     \
+    c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c;     \
+    c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c;     \
+    c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c;     \
+    c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c;     \
+    c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
+    c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c;     \
     h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
     h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
 
 static void fe_frombytes(fe h, const u8 s[32])
 {
-    i64 t0 = load32_le(s);
-    i64 t1 = load24_le(s + 4) << 6;
-    i64 t2 = load24_le(s + 7) << 5;
-    i64 t3 = load24_le(s + 10) << 3;
-    i64 t4 = load24_le(s + 13) << 2;
-    i64 t5 = load32_le(s + 16);
-    i64 t6 = load24_le(s + 20) << 7;
-    i64 t7 = load24_le(s + 23) << 5;
-    i64 t8 = load24_le(s + 26) << 4;
-    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
-    FE_CARRY;
-}
-
+    i64 t0 = load32_le(s);                        // t0 < 2^32
+    i64 t1 = load24_le(s + 4)  << 6;              // t1 < 2^30
+    i64 t2 = load24_le(s + 7)  << 5;              // t2 < 2^29
+    i64 t3 = load24_le(s + 10) << 3;              // t3 < 2^27
+    i64 t4 = load24_le(s + 13) << 2;              // t4 < 2^26
+    i64 t5 = load32_le(s + 16);                   // t5 < 2^32
+    i64 t6 = load24_le(s + 20) << 7;              // t6 < 2^31
+    i64 t7 = load24_le(s + 23) << 5;              // t7 < 2^29
+    i64 t8 = load24_le(s + 26) << 4;              // t8 < 2^28
+    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2; // t9 < 2^25
+    FE_CARRY;                                     // Carry precondition OK
+}
+
+// Precondition
+// ------------
+//   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
+//   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
+//
+// Therefore, |h| < 2^255 - 19
+// There are two possibilities:
+//
+// - If h is positive, all we need to do is reduce its individual
+//   limbs down to their tight positive range.
+// - If h is negative, we also need to add 2^255-19 to it,
+//   or just remove 19 and chop off any excess bit.
 static void fe_tobytes(u8 s[32], const fe h)
 {
     i32 t[10];
     COPY(t, h, 10);
     i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
+    //                 |t9|                    < 1.1 * 2^24
+    //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
+    //  -21  * 2^24  <  19 * t9                < 21  * 2^24
+    //  -2^29        <  19 * t9 + 2^24         < 2^29
+    //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
+    //  -16          < (19 * t9 + 2^24) / 2^25 < 16
     FOR (i, 0, 5) {
-        q += t[2*i  ]; q >>= 26;
-        q += t[2*i+1]; q >>= 25;
+        q += t[2*i  ]; q >>= 26; // q = 0 or -1
+        q += t[2*i+1]; q >>= 25; // q = 0 or -1
     }
+    // q =  0 iff h >= 0
+    // q = -1 iff h <  0
+    // Adding q * 19 to h reduces h to its proper range.
     q *= 19;
     // Shift carry back to the beginning
     FOR (i, 0, 5) {
         t[i*2  ] += q; q = t[i*2  ] >> 26; t[i*2  ] -= q * ((i32)1 << 26);
         t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
     }
+    // h is now fully reduced, and q represents the excess bit.
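+    // Pack the 10 reduced limbs into 32 little-endian bytes. Limb i
+    // starts at bit 26*ceil(i/2) + 25*floor(i/2), i.e. at bits 0, 26,
+    // 51, 77, 102, 128, 153, 179, 204, and 230. Since h is now below
+    // 2^255, bit 255 ends up zero.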
     store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
     store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
@@ -1151,7 +1281,13 @@ static void fe_tobytes(u8 s[32], const fe h)
     WIPE_BUFFER(t);
 }
 
-// multiply a field element by a signed 32-bit integer
+// Multiply a field element by a signed 32-bit integer.
+//
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+//   |g| <= 2^31  (g is a signed 32-bit integer)
 static void fe_mul_small(fe h, const fe f, i32 g)
 {
     i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
@@ -1159,9 +1295,19 @@ static void fe_mul_small(fe h, const fe f, i32 g)
     i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
     i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
     i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
-    FE_CARRY;
+    // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  <  2^58
+    // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  <  2^57
+
+    FE_CARRY; // Carry precondition OK
 }
 
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
+//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
 static void fe_mul(fe h, const fe f, const fe g)
 {
     // Everything is unrolled and put in temporary variables.
@@ -1174,6 +1320,9 @@ static void fe_mul(fe h, const fe f, const fe g)
     i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
     i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
     i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
+    // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
+    // |G2|, |G4|, |G6|, |G8|        <  2^31
+    // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
     i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
            + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
@@ -1195,11 +1344,26 @@ static void fe_mul(fe h, const fe f, const fe g)
           + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
     i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
            + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
-
-    FE_CARRY;
-}
-
-// we could use fe_mul() for this, but this is significantly faster
+    // t0 < 0.67 * 2^61
+    // t1 < 0.41 * 2^61
+    // t2 < 0.52 * 2^61
+    // t3 < 0.32 * 2^61
+    // t4 < 0.38 * 2^61
+    // t5 < 0.22 * 2^61
+    // t6 < 0.23 * 2^61
+    // t7 < 0.13 * 2^61
+    // t8 < 0.09 * 2^61
+    // t9 < 0.03 * 2^61
+
+    FE_CARRY; // Everything is below 2^62; carry precondition OK
+}
+
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+// Note: we could use fe_mul() for this, but this is significantly faster.
 static void fe_sq(fe h, const fe f)
 {
     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
@@ -1208,6 +1372,9 @@ static void fe_sq(fe h, const fe f)
     i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2 = f6*2;  i32 f7_2 = f7*2;
     i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
     i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
+    // |f0_2| , |f2_2| , |f4_2| , |f6_2|            <  1.65 * 2^27
+    // |f1_2| , |f3_2| , |f5_2| , |f7_2|            <  1.65 * 2^26
+    // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38|  <  2^31
     i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
            + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
@@ -1229,11 +1396,38 @@ static void fe_sq(fe h, const fe f)
           + f3_2*(i64)f5_2 + f4  *(i64)f4 + f9  *(i64)f9_38;
     i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7 + f3_2*(i64)f6
            + f4  *(i64)f5_2;
+    // t0 < 0.67 * 2^61
+    // t1 < 0.41 * 2^61
+    // t2 < 0.52 * 2^61
+    // t3 < 0.32 * 2^61
+    // t4 < 0.38 * 2^61
+    // t5 < 0.22 * 2^61
+    // t6 < 0.23 * 2^61
+    // t7 < 0.13 * 2^61
+    // t8 < 0.09 * 2^61
+    // t9 < 0.03 * 2^61
 
     FE_CARRY;
 }
 
 // h = 2 * (f^2)
+//
+// Precondition
+// ------------
+//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
+//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
+//
+// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
+// each limb by 2, *then* performing the carry. This saves one carry.
+// However, doing so with the stated preconditions does not work (t2
+// would overflow). There are 3 ways to solve this:
+//
+// 1. Show that t2 actually never overflows (it really does not).
+// 2. Accept an additional carry, at a small loss of performance.
+// 3. Make sure the input of fe_sq2() is freshly carried.
+//
+// SUPERCOP ref10 relies on (1).
+// Monocypher chose (2) and (3), mostly to save code.
 static void fe_sq2(fe h, const fe f)
 {
     fe_sq(h, f);
-- 
2.47.3