From: Loup Vaillant
Date: Fri, 3 Aug 2018 21:25:55 +0000 (+0200)
Subject: Cleaner fe_frombytes() (loading field elements)
X-Git-Url: https://git.codecow.com/?a=commitdiff_plain;h=6ee8787e61b3918789eab0ab38ce176abc767abb;p=Monocypher.git

Cleaner fe_frombytes() (loading field elements)

The old version of fe_frombytes() from the ref10 implementation was not
as clean as I wanted it to be: instead of loading exactly the right
bytes, it played fast and loose, then used a carry operation to
compensate.

It works, but there's a more direct, simpler, and I suspect faster
approach: put the right bits in the right place to begin with.
---

diff --git a/src/monocypher.c b/src/monocypher.c
index 142d645..39cbe0d 100644
--- a/src/monocypher.c
+++ b/src/monocypher.c
@@ -33,13 +33,6 @@ typedef uint64_t u64;
 
 static const u8 zero[128] = {0};
 
-static u32 load24_le(const u8 s[3])
-{
-    return (u32)s[0]
-        | ((u32)s[1] <<  8)
-        | ((u32)s[2] << 16);
-}
-
 static u32 load32_le(const u8 s[4])
 {
     return (u32)s[0]
@@ -1034,34 +1027,18 @@ static void fe_ccopy(fe f, const fe g, i32 b)
     }
 }
 
-#define FE_CARRY                                                          \
-    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                           \
-    c9 = (t9 + (i64) (1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25); \
-    c1 = (t1 + (i64) (1<<24)) >> 25; t2 += c1;      t1 -= c1 * (1 << 25); \
-    c3 = (t3 + (i64) (1<<24)) >> 25; t4 += c3;      t3 -= c3 * (1 << 25); \
-    c5 = (t5 + (i64) (1<<24)) >> 25; t6 += c5;      t5 -= c5 * (1 << 25); \
-    c7 = (t7 + (i64) (1<<24)) >> 25; t8 += c7;      t7 -= c7 * (1 << 25); \
-    c0 = (t0 + (i64) (1<<25)) >> 26; t1 += c0;      t0 -= c0 * (1 << 26); \
-    c2 = (t2 + (i64) (1<<25)) >> 26; t3 += c2;      t2 -= c2 * (1 << 26); \
-    c4 = (t4 + (i64) (1<<25)) >> 26; t5 += c4;      t4 -= c4 * (1 << 26); \
-    c6 = (t6 + (i64) (1<<25)) >> 26; t7 += c6;      t6 -= c6 * (1 << 26); \
-    c8 = (t8 + (i64) (1<<25)) >> 26; t9 += c8;      t8 -= c8 * (1 << 26); \
-    h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
-    h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
-
 static void fe_frombytes(fe h, const u8 s[32])
 {
-    i64 t0 =  load32_le(s);
-    i64 t1 =  load24_le(s +  4) << 6;
-    i64 t2 =  load24_le(s +  7) << 5;
-    i64 t3 =  load24_le(s + 10) << 3;
-    i64 t4 =  load24_le(s + 13) << 2;
-    i64 t5 =  load32_le(s + 16);
-    i64 t6 =  load24_le(s + 20) << 7;
-    i64 t7 =  load24_le(s + 23) << 5;
-    i64 t8 =  load24_le(s + 26) << 4;
-    i64 t9 = (load24_le(s + 29) & 8388607) << 2;
-    FE_CARRY;
+    h[0] = load32_le(s)           & 0x3ffffff;
+    h[1] = load32_le(s +  3) >> 2 & 0x1ffffff;
+    h[2] = load32_le(s +  6) >> 3 & 0x3ffffff;
+    h[3] = load32_le(s +  9) >> 5 & 0x1ffffff;
+    h[4] = load32_le(s + 12) >> 6 & 0x3ffffff;
+    h[5] = load32_le(s + 16)      & 0x1ffffff;
+    h[6] = load32_le(s + 19) >> 1 & 0x3ffffff;
+    h[7] = load32_le(s + 22) >> 3 & 0x1ffffff;
+    h[8] = load32_le(s + 25) >> 4 & 0x3ffffff;
+    h[9] = load32_le(s + 28) >> 6 & 0x1ffffff;
 }
 
 static void fe_mul_small(fe h, const fe f, i32 g)
@@ -1071,7 +1048,20 @@ static void fe_mul_small(fe h, const fe f, i32 g)
     i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
     i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
     i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
-    FE_CARRY;
+
+    i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    c9 = (t9 + (i64) (1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25);
+    c1 = (t1 + (i64) (1<<24)) >> 25; t2 += c1;      t1 -= c1 * (1 << 25);
+    c3 = (t3 + (i64) (1<<24)) >> 25; t4 += c3;      t3 -= c3 * (1 << 25);
+    c5 = (t5 + (i64) (1<<24)) >> 25; t6 += c5;      t5 -= c5 * (1 << 25);
+    c7 = (t7 + (i64) (1<<24)) >> 25; t8 += c7;      t7 -= c7 * (1 << 25);
+    c0 = (t0 + (i64) (1<<25)) >> 26; t1 += c0;      t0 -= c0 * (1 << 26);
+    c2 = (t2 + (i64) (1<<25)) >> 26; t3 += c2;      t2 -= c2 * (1 << 26);
+    c4 = (t4 + (i64) (1<<25)) >> 26; t5 += c4;      t4 -= c4 * (1 << 26);
+    c6 = (t6 + (i64) (1<<25)) >> 26; t7 += c6;      t6 -= c6 * (1 << 26);
+    c8 = (t8 + (i64) (1<<25)) >> 26; t9 += c8;      t8 -= c8 * (1 << 26);
+    h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4;
+    h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9;
 }
 
 static void fe_mul121666(fe h, const fe f) { fe_mul_small(h, f, 121666); }
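
The shift and mask constants above follow from the limb layout: the ten
limbs alternate between 26 and 25 bits wide, so limb i starts at bit
offset 0, 26, 51, 77, 102, 128, 153, 179, 204, 230 of the little-endian
integer, and each load32_le() straddles the bytes that contain that
slice. The standalone program below is a quick sanity check along those
lines; it is illustrative only, not part of this commit, and the test
harness (main(), rand(), the iteration count) is an assumption for
demonstration. It loads random 32-byte strings with the new
fe_frombytes(), packs the ten limbs back into bytes, and verifies that
the round trip recovers the input with bit 255 cleared, since
fe_frombytes() discards that bit.

/* Illustrative sanity check -- not part of this commit.  It verifies
 * that the new fe_frombytes() really does put "the right bits in the
 * right place": packing the ten limbs back into bytes must recover
 * the input string, minus bit 255, which fe_frombytes() discards. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint8_t  u8;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int32_t  i32;
typedef i32 fe[10];

static u32 load32_le(const u8 s[4])
{
    return (u32)s[0]
        | ((u32)s[1] <<  8)
        | ((u32)s[2] << 16)
        | ((u32)s[3] << 24);
}

/* Same code as in the patch above. */
static void fe_frombytes(fe h, const u8 s[32])
{
    h[0] = load32_le(s)           & 0x3ffffff;
    h[1] = load32_le(s +  3) >> 2 & 0x1ffffff;
    h[2] = load32_le(s +  6) >> 3 & 0x3ffffff;
    h[3] = load32_le(s +  9) >> 5 & 0x1ffffff;
    h[4] = load32_le(s + 12) >> 6 & 0x3ffffff;
    h[5] = load32_le(s + 16)      & 0x1ffffff;
    h[6] = load32_le(s + 19) >> 1 & 0x3ffffff;
    h[7] = load32_le(s + 22) >> 3 & 0x1ffffff;
    h[8] = load32_le(s + 25) >> 4 & 0x3ffffff;
    h[9] = load32_le(s + 28) >> 6 & 0x1ffffff;
}

int main(void)
{
    /* Bit offset of each limb; widths alternate 26, 25, 26, 25... */
    static const int offset[10] = {0,26,51,77,102,128,153,179,204,230};

    for (int i = 0; i < 100000; i++) {
        u8 s[32];
        for (int j = 0; j < 32; j++) { s[j] = (u8)rand(); }

        fe h;
        fe_frombytes(h, s);

        /* Each limb is an exact, disjoint bit slice of the input,
         * so ORing the limbs back into place rebuilds the string. */
        u8 out[32] = {0};
        for (int j = 0; j < 10; j++) {
            u64 chunk = (u64)(u32)h[j] << (offset[j] % 8);
            int byte  = offset[j] / 8;
            for (int k = 0; k < 5 && byte + k < 32; k++) {
                out[byte + k] |= (u8)(chunk >> (8 * k));
            }
        }

        u8 expected[32];
        memcpy(expected, s, 32);
        expected[31] &= 0x7f;  /* bit 255 is not part of the field element */
        if (memcmp(out, expected, 32) != 0) {
            printf("round trip failed\n");
            return 1;
        }
    }
    printf("round trip OK\n");
    return 0;
}

Note that this only checks the bit layout, not equivalence with the old
code: the old FE_CARRY path leaves the limbs in a centered (possibly
negative) form, so the two versions represent the same field element
without being limb-for-limb identical.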