////////////////////////////////////
/// Arithmetic modulo 2^255 - 19 ///
////////////////////////////////////
-// Taken from SUPERCOP's ref10 implementation.
+// Originally taken from SUPERCOP's ref10 implementation.
// A bit bigger than TweetNaCl, over 4 times faster.
// field element
}
}
+
+// Signed carry propagation
+// ------------------------
+//
+// Let t be a number. It can be uniquely decomposed thus:
+//
+// t = h*2^26 + l
+// such that -2^25 <= l < 2^25
+//
+// Let c = (t + 2^25) / 2^26 (rounded down)
+// c = (h*2^26 + l + 2^25) / 2^26 (rounded down)
+// c = h + (l + 2^25) / 2^26 (rounded down)
+// c = h (exactly)
+// Because 0 <= l + 2^25 < 2^26
+//
+// Let u = t - c*2^26
+// u = h*2^26 + l - h*2^26
+// u = l
+// Therefore, -2^25 <= u < 2^25
+//
+// Additionally, if |t| < x, then |h| <= x/2^26 (rounded down)
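+//
+// For instance (an illustrative example, not in the original
+// comment): t = 3*2^26 - 5 decomposes as h = 3, l = -5:
+//
+// c = (3*2^26 - 5 + 2^25) / 2^26 (rounded down) = 3 = h
+// u = t - 3*2^26 = -5 = l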
+//
+// Notations:
+// - In C, 1<<25 means 2^25.
+// - In C, x>>25 means floor(x / (2^25)).
+// - All of the above applies with 25 & 24 as well as 26 & 25.
+//
+//
+// Note on negative right shifts
+// -----------------------------
+//
+// In C, x >> n, where x is a negative integer, is implementation
+// defined. In practice, all platforms do arithmetic shift, which is
+// equivalent to division by 2^n, rounded down. Some compilers, like
+// GCC, even guarantee it.
+//
+// If we ever stumble upon a platform that does not propagate the sign
+// bit (we won't), visible failures will show at the slightest test, and
+// the signed shifts can be replaced by the following:
+//
+// typedef struct { i64 x:39; } s25;
+// typedef struct { i64 x:38; } s26;
+// i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
+// i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
+//
+// Current compilers cannot optimise this, causing a 30% drop in
+// performance. Fairly expensive for something that never happens.
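+//
+// If that fallback were ever needed, each carry step would call the
+// helper instead of shifting directly. A sketch (not part of this
+// change):
+//
+// c = shift26(t0 + ((i64)1<<25)); t0 -= c * ((i64)1 << 26); t1 += c;
+//
+// and likewise with shift25() for the 25-bit limbs.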
+//
+//
+// Precondition
+// ------------
+//
+// |t0| < 2^63
+// |t1|..|t9| < 2^62
+//
+// Algorithm
+// ---------
+// c = t0 + 2^25 / 2^26 -- |c| <= 2^37
+// t0 -= c * 2^26 -- |t0| <= 2^25
+// t1 += c -- |t1| <= 2^63
+//
+// c = t4 + 2^25 / 2^26 -- |c| <= 2^36
+// t4 -= c * 2^26 -- |t4| <= 2^25
+// t5 += c -- |t5| <= 2^63
+//
+// c = t1 + 2^24 / 2^25 -- |c| <= 2^38
+// t1 -= c * 2^25 -- |t1| <= 2^24
+// t2 += c -- |t2| <= 2^63
+//
+// c = t5 + 2^24 / 2^25 -- |c| <= 2^38
+// t5 -= c * 2^25 -- |t5| <= 2^24
+// t6 += c -- |t6| <= 2^63
+//
+// c = t2 + 2^25 / 2^26 -- |c| <= 2^37
+// t2 -= c * 2^26 -- |t2| <= 2^25 < 1.1 * 2^25 (final t2)
+// t3 += c -- |t3| <= 2^63
+//
+// c = t6 + 2^25 / 2^26 -- |c| <= 2^37
+// t6 -= c * 2^26 -- |t6| <= 2^25 < 1.1 * 2^25 (final t6)
+// t7 += c -- |t7| <= 2^63
+//
+// c = t3 + 2^24 / 2^25 -- |c| <= 2^38
+// t3 -= c * 2^25 -- |t3| <= 2^24 < 1.1 * 2^24 (final t3)
+// t4 += c -- |t4| <= 2^25 + 2^38 < 2^39
+//
+// c = t7 + 2^24 / 2^25 -- |c| <= 2^38
+// t7 -= c * 2^25 -- |t7| <= 2^24 < 1.1 * 2^24 (final t7)
+// t8 += c -- |t8| <= 2^63
+//
+// c = t4 + 2^25 / 2^26 -- |c| <= 2^13
+// t4 -= c * 2^26 -- |t4| <= 2^25 < 1.1 * 2^25 (final t4)
+// t5 += c -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5)
+//
+// c = t8 + 2^25 / 2^26 -- |c| <= 2^37
+// t8 -= c * 2^26 -- |t8| <= 2^25 < 1.1 * 2^25 (final t8)
+// t9 += c -- |t9| <= 2^63
+//
+// c = t9 + 2^24 / 2^25 -- |c| <= 2^38
+// t9 -= c * 2^25 -- |t9| <= 2^24 < 1.1 * 2^24 (final t9)
+// t0 += c * 19 -- |t0| <= 2^25 + 2^38*19 < 2^44
+//
+// c = t0 + 2^25 / 2^26 -- |c| <= 2^18
+// t0 -= c * 2^26 -- |t0| <= 2^25 < 1.1 * 2^25 (final t0)
+// t1 += c -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1)
+//
+// Postcondition
+// -------------
+// |t0|, |t2|, |t4|, |t6|, |t8| < 1.1 * 2^25
+// |t1|, |t3|, |t5|, |t7|, |t9| < 1.1 * 2^24
#define FE_CARRY \
- i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9; \
- c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26); \
- c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26); \
- c1 = (t1 + ((i64)1<<24)) >> 25; t2 += c1; t1 -= c1 * ((i64)1 << 25); \
- c5 = (t5 + ((i64)1<<24)) >> 25; t6 += c5; t5 -= c5 * ((i64)1 << 25); \
- c2 = (t2 + ((i64)1<<25)) >> 26; t3 += c2; t2 -= c2 * ((i64)1 << 26); \
- c6 = (t6 + ((i64)1<<25)) >> 26; t7 += c6; t6 -= c6 * ((i64)1 << 26); \
- c3 = (t3 + ((i64)1<<24)) >> 25; t4 += c3; t3 -= c3 * ((i64)1 << 25); \
- c7 = (t7 + ((i64)1<<24)) >> 25; t8 += c7; t7 -= c7 * ((i64)1 << 25); \
- c4 = (t4 + ((i64)1<<25)) >> 26; t5 += c4; t4 -= c4 * ((i64)1 << 26); \
- c8 = (t8 + ((i64)1<<25)) >> 26; t9 += c8; t8 -= c8 * ((i64)1 << 26); \
- c9 = (t9 + ((i64)1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * ((i64)1 << 25); \
- c0 = (t0 + ((i64)1<<25)) >> 26; t1 += c0; t0 -= c0 * ((i64)1 << 26); \
+ i64 c; \
+ c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
+ c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
+ c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c; \
+ c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c; \
+ c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c; \
+ c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c; \
+ c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c; \
+ c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c; \
+ c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
+ c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c; \
+ c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
+ c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
static void fe_frombytes(fe h, const u8 s[32])
{
- i64 t0 = load32_le(s);
- i64 t1 = load24_le(s + 4) << 6;
- i64 t2 = load24_le(s + 7) << 5;
- i64 t3 = load24_le(s + 10) << 3;
- i64 t4 = load24_le(s + 13) << 2;
- i64 t5 = load32_le(s + 16);
- i64 t6 = load24_le(s + 20) << 7;
- i64 t7 = load24_le(s + 23) << 5;
- i64 t8 = load24_le(s + 26) << 4;
- i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
- FE_CARRY;
-}
-
+ i64 t0 = load32_le(s); // t0 < 2^32
+ i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30
+ i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29
+ i64 t3 = load24_le(s + 10) << 3; // t3 < 2^27
+ i64 t4 = load24_le(s + 13) << 2; // t4 < 2^26
+ i64 t5 = load32_le(s + 16); // t5 < 2^32
+ i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31
+ i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29
+ i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28
+ i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2; // t9 < 2^25
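+ // The shifts align each load with its limb: byte offsets 0, 4, 7,
+ // 10, 13, 16, 20, 23, 26, 29 sit at bit offsets 0, 32, 56, 80, 104,
+ // 128, 160, 184, 208, 232, and the limb bases are 0, 26, 51, 77,
+ // 102, 128, 153, 179, 204, 230.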
+ FE_CARRY; // Carry precondition OK
+}
+
+// Precondition
+// ------------
+// |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25
+// |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24
+//
+// Therefore, |h| < 2^255-19
+// There are two possibilities:
+//
+// - If h is positive, all we need to do is reduce its individual
+// limbs down to their tight positive range.
+// - If h is negative, we also need to add 2^255-19 to it.
+//   Or just subtract 19 and chop off any excess bit: modulo 2^255,
+//   subtracting 19 is the same as adding 2^255-19.
static void fe_tobytes(u8 s[32], const fe h)
{
i32 t[10];
COPY(t, h, 10);
i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
+ // |t9| < 1.1 * 2^24
+ // -1.1 * 2^24 < t9 < 1.1 * 2^24
+ // -21 * 2^24 < 19 * t9 < 21 * 2^24
+ // -2^29 < 19 * t9 + 2^24 < 2^29
+ // -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
+ // -16 < (19 * t9 + 2^24) / 2^25 < 16
FOR (i, 0, 5) {
- q += t[2*i ]; q >>= 26;
- q += t[2*i+1]; q >>= 25;
+ q += t[2*i ]; q >>= 26; // q = 0 or -1
+ q += t[2*i+1]; q >>= 25; // q = 0 or -1
}
+ // q = 0 iff h >= 0
+ // q = -1 iff h < 0
+ // Adding q * 19 to h reduces h to its proper range.
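+ // Example: for h == -1 (t0 = -1, t1..t9 = 0), the initial q is 0,
+ // the first step drives it to -1, and it stays there; for h == 0,
+ // q stays 0 throughout.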
q *= 19; // Shift carry back to the beginning
FOR (i, 0, 5) {
t[i*2 ] += q; q = t[i*2 ] >> 26; t[i*2 ] -= q * ((i32)1 << 26);
t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
}
+ // h is now fully reduced, and q represents the excess bit.
store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26));
store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19));
WIPE_BUFFER(t);
}
-// multiply a field element by a signed 32-bit integer
+// Precondition
+// -------------
+// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
+// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
+//
+// g is a plain signed 32-bit integer, so |g| < 2^31
static void fe_mul_small(fe h, const fe f, i32 g)
{
i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g;
i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g;
i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g;
i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g;
i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g;
- FE_CARRY;
+ // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58
+ // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57
+
+ FE_CARRY; // Carry precondition OK
}
+// Precondition
+// -------------
+// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
+// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
+//
+// |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
+// |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
static void fe_mul(fe h, const fe f, const fe g)
{
// Everything is unrolled and put in temporary variables.
i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
+ // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26
+ // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31
+ // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30
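+ // Each product f_i * g_j contributes to limb i+j: terms that wrap
+ // around (i+j >= 10) pick up a factor of 19 because 2^255 = 19
+ // modulo 2^255-19 (hence the G variables), and products of two
+ // odd-indexed limbs pick up a factor of 2 because odd limbs carry
+ // 25 bits instead of 26 (hence the F variables).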
i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
+ F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
+ F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
+ f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
-
- FE_CARRY;
-}
-
-// we could use fe_mul() for this, but this is significantly faster
+ // t0 < 0.67 * 2^61
+ // t1 < 0.41 * 2^61
+ // t2 < 0.52 * 2^61
+ // t3 < 0.32 * 2^61
+ // t4 < 0.38 * 2^61
+ // t5 < 0.22 * 2^61
+ // t6 < 0.23 * 2^61
+ // t7 < 0.13 * 2^61
+ // t8 < 0.09 * 2^61
+ // t9 < 0.03 * 2^61
+
+ FE_CARRY; // Everything below 2^62, Carry precondition OK
+}
+
+// Precondition
+// -------------
+// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
+// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
+//
+// Note: we could use fe_mul() for this, but this is significantly faster
static void fe_sq(fe h, const fe f)
{
i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
i32 f8_19 = f8*19; i32 f9_38 = f9*38;
+ // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2| < 1.65 * 2^27
+ // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2| < 1.65 * 2^26
+ // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31
i64 t0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
+ f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38;
+ f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38;
i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7
+ f3_2*(i64)f6 + f4 *(i64)f5_2;
+ // t0 < 0.67 * 2^61
+ // t1 < 0.41 * 2^61
+ // t2 < 0.52 * 2^61
+ // t3 < 0.32 * 2^61
+ // t4 < 0.38 * 2^61
+ // t5 < 0.22 * 2^61
+ // t6 < 0.23 * 2^61
+ // t7 < 0.13 * 2^61
+ // t8 < 0.09 * 2^61
+ // t9 < 0.03 * 2^61
FE_CARRY;
}
// h = 2 * (f^2)
+//
+// Precondition
+// -------------
+// |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
+// |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
+//
+// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
+// each limb by 2, *then* performing the carry. This saves one carry.
+// However, doing so with the stated preconditions does not work (t2
+// would overflow). There are 3 ways to solve this:
+//
+// 1. Show that t2 actually never overflows (it really does not).
+// 2. Accept an additional carry, at a small loss of performance.
+// 3. Make sure the input of fe_sq2() is freshly carried.
+//
+// SUPERCOP ref10 relies on (1).
+// Monocypher chose (2) and (3), mostly to save code.
static void fe_sq2(fe h, const fe f)
{
fe_sq(h, f);