From 70b9b5aa70be0d84ba0826bd358ac235df932ee4 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Mon, 16 Mar 2026 13:57:59 -0700 Subject: [PATCH] Refactor field element multiplication hot paths. --- asconfig.json | 2 +- assembly/base.ts | 2 +- assembly/blake2b.ts | 12 +- assembly/constants.ts | 35 +- assembly/fe.ts | 1504 +++++++++++++++++++++++++++++------------ assembly/ge.ts | 579 +++------------- assembly/nano-nacl.ts | 39 +- assembly/p.ts | 236 +++++++ index.html | 47 +- index.ts | 125 +++- 10 files changed, 1591 insertions(+), 990 deletions(-) create mode 100644 assembly/p.ts diff --git a/asconfig.json b/asconfig.json index 11c4e61..2828ae2 100644 --- a/asconfig.json +++ b/asconfig.json @@ -9,7 +9,7 @@ "uncheckedBehavior": "always", "bindings": "esm", "sourceMap": false, - "debug": false, + "debug": true, "runtime": "stub", "exportRuntime": false, "enable": [ diff --git a/assembly/base.ts b/assembly/base.ts index c9a7293..d59e79a 100644 --- a/assembly/base.ts +++ b/assembly/base.ts @@ -2,7 +2,7 @@ //! SPDX-License-Identifier: ISC import { fe } from './fe' -import { ge_precomp } from './ge' +import { ge_precomp } from './p' export const base = StaticArray.fromArray>([ StaticArray.fromArray([ /* 0/31 */ diff --git a/assembly/blake2b.ts b/assembly/blake2b.ts index c61f76f..82c654b 100644 --- a/assembly/blake2b.ts +++ b/assembly/blake2b.ts @@ -75,8 +75,9 @@ export class Blake2b { this.p[3] = 1 // depth // initialize hash state + memory.copy(changetype(this.h), changetype(blake2b_iv), 64) for (let i = 0; i < 8; i++) { - this.h[i] = blake2b_iv[i] ^ load(changetype(this.p) + (i << 3)) + this.h[i] ^= load(changetype(this.p) + (i << 3)) } return this } @@ -120,13 +121,12 @@ export class Blake2b { } // Defined in BLAKE2 section 2.4 + @inline COMPRESS (isFinal: bool): void { // initialize state vector - for (let i = 0; i < 8; i++) { - this.v[i] = this.h[i] - this.v[i + 8] = blake2b_iv[i] - } + memory.copy(changetype(this.v), changetype(this.h), 64) + memory.copy(changetype(this.v) + 64, changetype(blake2b_iv), 64) // lo 64 bits of counter this.v[12] ^= v128.extract_lane(this.t, 0) @@ -151,6 +151,7 @@ export class Blake2b { } } + @inline ROUND (r: u8): void { for (let i = 0; i < 8; i++) { const a = blake2b_state[i][0] @@ -162,6 +163,7 @@ export class Blake2b { } } + @inline G (a: u8, b: u8, c: u8, d: u8, x: u8, y: u8): void { this.v[a] = this.v[a] + this.v[b] + this.m[x] this.v[d] = rotr(this.v[d] ^ this.v[a], 32) diff --git a/assembly/constants.ts b/assembly/constants.ts index 426adc7..3b41cd9 100644 --- a/assembly/constants.ts +++ b/assembly/constants.ts @@ -6,45 +6,12 @@ export const fe25519_sqrtm1 = StaticArray.fromArray([ -32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482 ]) -/* sqrt(-486664) */ -export const ed25519_sqrtam2 = StaticArray.fromArray([ - -12222970, -8312128, -11511410, 9067497, -15300785, -241793, 25456130, 14121551, -12187136, 3972024 -]) - /* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */ export const ed25519_d = StaticArray.fromArray([ -10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116 ]) -/* 2 * d = - * 16295367250680780974490674513165176452449235426866156013048779062215315747161 - */ +/* 16295367250680780974490674513165176452449235426866156013048779062215315747161 */ export const ed25519_d2 = StaticArray.fromArray([ -21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199 ]) - -/* A = 486662 */ -export const ed25519_A_32 = 486662 -export const ed25519_A = StaticArray.fromArray([ - ed25519_A_32, 0, 0, 0, 0, 0, 0, 0, 0, 0 -]) - -/* sqrt(ad - 1) with a = -1 (mod p) */ -export const ed25519_sqrtadm1 = StaticArray.fromArray([ - 24849947, -153582, -23613485, 6347715, -21072328, -667138, -25271143, -15367704, -870347, 14525639 -]) - -/* 1 / sqrt(a - d) */ -export const ed25519_invsqrtamd = StaticArray.fromArray([ - 6111485, 4156064, -27798727, 12243468, -25904040, 120897, 20826367, -7060776, 6093568, -1986012 -]) - -/* 1 - d ^ 2 */ -export const ed25519_onemsqd = StaticArray.fromArray([ - 6275446, -16617371, -22938544, -3773710, 11667077, 7397348, -27922721, 1766195, -24433858, 672203 -]) - -/* (d - 1) ^ 2 */ -export const ed25519_sqdmone = StaticArray.fromArray([ - 15551795, -11097455, -13425098, -10125071, -11896535, 10178284, -26634327, 4729244, -5282110, -10116402 -]) diff --git a/assembly/fe.ts b/assembly/fe.ts index f2170b0..e12d4b4 100644 --- a/assembly/fe.ts +++ b/assembly/fe.ts @@ -20,8 +20,10 @@ import { load_3, load_4, sodium_is_zero } from "./utils" * 10x 25- and 26-bit array of values representing a large integer `n mod p`. */ export type FieldElement = StaticArray -export function fe (source: Array = new Array(10)): FieldElement { - return StaticArray.fromArray(source) +export function fe (src: Array = new Array(12)): FieldElement { + const dst = new StaticArray(12) + memory.copy(changetype(dst), src.dataStart, src.length << 2) + return dst } /** @@ -32,7 +34,7 @@ export function fe (source: Array = new Array(10)): FieldElement { //@ts-expect-error @inline export function fe_0 (h: FieldElement): void { - memory.fill(changetype(h), 0, 40) + memory.fill(changetype(h), 0, 48) } /** @@ -44,7 +46,7 @@ export function fe_0 (h: FieldElement): void { @inline export function fe_1 (h: FieldElement): void { fe_0(h) - store(changetype(h), 1) + store(changetype(h), 1, 0) } /** @@ -57,21 +59,12 @@ export function fe_1 (h: FieldElement): void { //@ts-expect-error @inline export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void { - const o = changetype(h) - const a = changetype(f) - const b = changetype(g) - - let ai = v128.load(a) - let bi = v128.load(b) - v128.store(o, v128.add(ai, bi)) - - ai = v128.load(a, 16) - bi = v128.load(b, 16) - v128.store(o, v128.add(ai, bi), 16) - - ai = i64x2(load(a + 32), 0) - bi = i64x2(load(b + 32), 0) - v128.store_lane(o, v128.add(ai, bi), 0, 32) + const h_ptr = changetype(h) + const f_ptr = changetype(f) + const g_ptr = changetype(g) + v128.store(h_ptr, v128.add(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0) + v128.store(h_ptr, v128.add(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16) + v128.store_lane(h_ptr, v128.add(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32) } /** @@ -84,22 +77,12 @@ export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void //@ts-expect-error @inline export function fe_cmov (f: FieldElement, g: FieldElement, b: u64): void { - const p = changetype(f) - const q = changetype(g) - + const f_ptr = changetype(f) + const g_ptr = changetype(g) const c = v128.splat(0 - b) - - let pi = v128.load(p) - let qi = v128.load(q) - v128.store(p, v128.bitselect(qi, pi, c)) - - pi = v128.load(p, 16) - qi = v128.load(q, 16) - v128.store(p, v128.bitselect(qi, pi, c), 16) - - pi = i64x2(load(p + 32), 0) - qi = i64x2(load(q + 32), 0) - v128.store_lane(p, v128.bitselect(qi, pi, c), 0, 32) + v128.store(f_ptr, v128.bitselect(v128.load(g_ptr), v128.load(f_ptr, 0), c)) + v128.store(f_ptr, v128.bitselect(v128.load(g_ptr, 16), v128.load(f_ptr, 16), c), 16) + v128.store_lane(f_ptr, v128.bitselect(v128.load(g_ptr, 32), v128.load(f_ptr, 32), c), 0, 32) } /** @@ -124,25 +107,23 @@ export function fe_copy (h: FieldElement, f: FieldElement): void { //@ts-expect-error @inline export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void { - const p = changetype(f) - const q = changetype(g) - + const f_ptr = changetype(f) + const g_ptr = changetype(g) const c = v128.splat(0 - b) - let pi = v128.load(p) - let qi = v128.load(q) - v128.store(p, v128.bitselect(qi, pi, c)) - v128.store(q, v128.bitselect(pi, qi, c)) - - pi = v128.load(p, 16) - qi = v128.load(q, 16) - v128.store(p, v128.bitselect(qi, pi, c), 16) - v128.store(q, v128.bitselect(pi, qi, c), 16) - - pi = i64x2(load(p + 32), 0) - qi = i64x2(load(q + 32), 0) - v128.store_lane(p, v128.bitselect(qi, pi, c), 0, 32) - v128.store_lane(q, v128.bitselect(pi, qi, c), 0, 32) + const f03 = v128.load(f_ptr) + const g03 = v128.load(g_ptr) + const f47 = v128.load(f_ptr, 16) + const g47 = v128.load(g_ptr, 16) + const f89 = v128.load(f_ptr, 32) + const g89 = v128.load(g_ptr, 32) + + v128.store(f_ptr, v128.bitselect(g03, f03, c)) + v128.store(g_ptr, v128.bitselect(f03, g03, c)) + v128.store(f_ptr, v128.bitselect(g47, f47, c), 16) + v128.store(g_ptr, v128.bitselect(f47, g47, c), 16) + v128.store_lane(f_ptr, v128.bitselect(g89, f89, c), 0, 32) + v128.store_lane(g_ptr, v128.bitselect(f89, g89, c), 0, 32) } /** @@ -154,17 +135,11 @@ export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void { //@ts-expect-error @inline export function fe_dbl (h: FieldElement, f: FieldElement): void { - const o = changetype(h) - const a = changetype(f) - - let ai = v128.load(a) - v128.store(o, v128.add(ai, ai)) - - ai = v128.load(a, 16) - v128.store(o, v128.add(ai, ai), 16) - - ai = i64x2(load(a + 32), 0) - v128.store_lane(o, v128.add(ai, ai), 0, 32) + const h_ptr = changetype(h) + const f_ptr = changetype(f) + v128.store(h_ptr, v128.shl(v128.load(f_ptr, 0), 1), 0) + v128.store(h_ptr, v128.shl(v128.load(f_ptr, 16), 1), 16) + v128.store_lane(h_ptr, v128.shl(v128.load(f_ptr, 32), 1), 0, 32) } /** @@ -195,46 +170,47 @@ export function fe_frombytes (h: FieldElement, s: StaticArray): void { carry9 = (h9 + i64(1 << 24)) >> 25 h0 += carry9 * 19 - h9 -= carry9 * u64(1 << 25) + h9 -= carry9 << 25 carry1 = (h1 + i64(1 << 24)) >> 25 h2 += carry1 - h1 -= carry1 * u64(1 << 25) + h1 -= carry1 << 25 carry3 = (h3 + i64(1 << 24)) >> 25 h4 += carry3 - h3 -= carry3 * u64(1 << 25) + h3 -= carry3 << 25 carry5 = (h5 + i64(1 << 24)) >> 25 h6 += carry5 - h5 -= carry5 * u64(1 << 25) + h5 -= carry5 << 25 carry7 = (h7 + i64(1 << 24)) >> 25 h8 += carry7 - h7 -= carry7 * u64(1 << 25) + h7 -= carry7 << 25 carry0 = (h0 + i64(1 << 25)) >> 26 h1 += carry0 - h0 -= carry0 * u64(1 << 26) + h0 -= carry0 << 26 carry2 = (h2 + i64(1 << 25)) >> 26 h3 += carry2 - h2 -= carry2 * u64(1 << 26) + h2 -= carry2 << 26 carry4 = (h4 + i64(1 << 25)) >> 26 h5 += carry4 - h4 -= carry4 * u64(1 << 26) + h4 -= carry4 << 26 carry6 = (h6 + i64(1 << 25)) >> 26 h7 += carry6 - h6 -= carry6 * u64(1 << 26) + h6 -= carry6 << 26 carry8 = (h8 + i64(1 << 25)) >> 26 h9 += carry8 - h8 -= carry8 * u64(1 << 26) - - h[0] = i32(h0) - h[1] = i32(h1) - h[2] = i32(h2) - h[3] = i32(h3) - h[4] = i32(h4) - h[5] = i32(h5) - h[6] = i32(h6) - h[7] = i32(h7) - h[8] = i32(h8) - h[9] = i32(h9) + h8 -= carry8 << 26 + + const h_ptr: usize = changetype(h) + store(h_ptr, h0, 0) + store(h_ptr, h1, 4) + store(h_ptr, h2, 8) + store(h_ptr, h3, 12) + store(h_ptr, h4, 16) + store(h_ptr, h5, 20) + store(h_ptr, h6, 24) + store(h_ptr, h7, 28) + store(h_ptr, h8, 32) + store(h_ptr, h9, 36) } const t0: FieldElement = fe() @@ -291,7 +267,7 @@ export function fe_invert (out: FieldElement, z: FieldElement): void { fe_mul(out, t1, t0) } -const fe_isnegative_s = new StaticArray(32) +const fe_isnegative_t = fe() /** * return 1 if f is in {1,3,5,...,q-2} * return 0 if f is in {0,2,4,...,q-1} @@ -302,9 +278,9 @@ const fe_isnegative_s = new StaticArray(32) //@ts-expect-error @inline export function fe_isnegative (f: FieldElement): u8 { - const s = fe_isnegative_s - fe_tobytes(s, f) - return s[0] & 1 + const t = fe_isnegative_t + fe_reduce(t, f) + return load(changetype(t), 0) & 1 } const fe_iszero_s = new StaticArray(32) @@ -323,7 +299,6 @@ export function fe_iszero (f: FieldElement): u8 { return sodium_is_zero(s, 32) } -const multiply_t = new StaticArray(24) /** * Multiply two FieldElements and store the product. * @@ -332,60 +307,256 @@ const multiply_t = new StaticArray(24) * @param g FieldElement multiplicand source */ export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void { - const o = changetype(h) - const a = changetype(f) - const b = changetype(g) - - const t = changetype(multiply_t) - memory.fill(t, 0, 160) - - const b0: v128 = v128.load(b) - const b4: v128 = v128.load(b + 16) - const b8: v128 = i32x4(load(b + 32), load(b + 36), 0, 0) - - for (let i = 0; i < 10; i++) { - const odd = (i & 1) + 1 - const ai = v128.splat(load(a + (i << 2))) - - let ptr = t + (usize(i) << 3) - let pLo = i64x2.extmul_low_i32x4_s(ai, b0) - let pHi = i64x2.extmul_high_i32x4_s(ai, b0) - pLo = v128.mul(pLo, i64x2(1, odd)) - pHi = v128.mul(pHi, i64x2(1, odd)) - let tLo = v128.load(ptr) - let tHi = v128.load(ptr + 16) - tLo = i64x2.add(tLo, pLo) - tHi = i64x2.add(tHi, pHi) - v128.store(ptr, tLo) - v128.store(ptr + 16, tHi) - - ptr += 32 - pLo = i64x2.extmul_low_i32x4_s(ai, b4) - pHi = i64x2.extmul_high_i32x4_s(ai, b4) - pLo = v128.mul(pLo, i64x2(1, odd)) - pHi = v128.mul(pHi, i64x2(1, odd)) - tLo = v128.load(ptr) - tHi = v128.load(ptr + 16) - tLo = i64x2.add(tLo, pLo) - tHi = i64x2.add(tHi, pHi) - v128.store(ptr, tLo) - v128.store(ptr + 16, tHi) - - ptr += 32 - pLo = i64x2.extmul_low_i32x4_s(ai, b8) - pHi = i64x2.extmul_high_i32x4_s(ai, b8) - pLo = v128.mul(pLo, i64x2(1, odd)) - pHi = v128.mul(pHi, i64x2(1, odd)) - tLo = v128.load(ptr) - tHi = v128.load(ptr + 16) - tLo = i64x2.add(tLo, pLo) - tHi = i64x2.add(tHi, pHi) - v128.store(ptr, tLo) - v128.store(ptr + 16, tHi) - } - // discard last 4 elements since we do not actually need them - memory.fill(t + 160, 0, 32) - normalize(o, t) + const f_ptr: usize = changetype(f) + const g_ptr: usize = changetype(g) + const g0123: v128 = v128.load(g_ptr, 0) + const g4567: v128 = v128.load(g_ptr, 16) + const g89xx: v128 = v128.load(g_ptr, 32) + + // mask to select even lane from first vector and odd lane from second + const m: v128 = i32x4(-1, 0, -1, 0) + + // mask to double odd-on-odd indexes + const odds: v128 = i32x4(0, -1, 0, -1) + + // factor of 19 for wrap from h9 to h0 + const v19: v128 = i32x4.splat(19) + const g0123_19: v128 = i32x4.mul(g0123, v19) + const g4567_19: v128 = i32x4.mul(g4567, v19) + const g89xx_19: v128 = i32x4.mul(g89xx, v19) + + // f[0] + let fi: i32 = load(f_ptr, 0) + let fv: v128 = i32x4.splat(fi) + + let h01: v128 = i64x2.extmul_low_i32x4_s(fv, g0123) + let h23: v128 = i64x2.extmul_high_i32x4_s(fv, g0123) + let h45: v128 = i64x2.extmul_low_i32x4_s(fv, g4567) + let h67: v128 = i64x2.extmul_high_i32x4_s(fv, g4567) + let h89: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx) + + // f[1] + fi = load(f_ptr, 4) + fv = i32x4.splat(fi) + fv = i32x4.add(fv, v128.and(fv, odds)) + + let h12: v128 = i64x2.extmul_low_i32x4_s(fv, g0123) + let h34: v128 = i64x2.extmul_high_i32x4_s(fv, g0123) + let h56: v128 = i64x2.extmul_low_i32x4_s(fv, g4567) + let h78: v128 = i64x2.extmul_high_i32x4_s(fv, g4567) + let h90: v128 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g89xx, g89xx_19, m)) + + // f[2] + fi = load(f_ptr, 8) + fv = i32x4.splat(fi) + + let t0: v128 = i64x2.extmul_low_i32x4_s(fv, g0123) + let t1: v128 = i64x2.extmul_high_i32x4_s(fv, g0123) + let t2: v128 = i64x2.extmul_low_i32x4_s(fv, g4567) + let t3: v128 = i64x2.extmul_high_i32x4_s(fv, g4567) + let t4: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h23 = i64x2.add(h23, t0) + h45 = i64x2.add(h45, t1) + h67 = i64x2.add(h67, t2) + h89 = i64x2.add(h89, t3) + h01 = i64x2.add(h01, t4) + + // f[3] + fi = load(f_ptr, 12) + fv = i32x4.splat(fi) + fv = i32x4.add(fv, v128.and(fv, odds)) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567) + t3 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m)) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h34 = i64x2.add(h34, t0) + h56 = i64x2.add(h56, t1) + h78 = i64x2.add(h78, t2) + h90 = i64x2.add(h90, t3) + h12 = i64x2.add(h12, t4) + + // f[4] + fi = load(f_ptr, 16) + fv = i32x4.splat(fi) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h45 = i64x2.add(h45, t0) + h67 = i64x2.add(h67, t1) + h89 = i64x2.add(h89, t2) + h01 = i64x2.add(h01, t3) + h23 = i64x2.add(h23, t4) + + // f[5] + fi = load(f_ptr, 20) + fv = i32x4.splat(fi) + fv = i32x4.add(fv, v128.and(fv, odds)) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123) + t2 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m)) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h56 = i64x2.add(h56, t0) + h78 = i64x2.add(h78, t1) + h90 = i64x2.add(h90, t2) + h12 = i64x2.add(h12, t3) + h34 = i64x2.add(h34, t4) + + // f[6] + fi = load(f_ptr, 24) + fv = i32x4.splat(fi) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h67 = i64x2.add(h67, t0) + h89 = i64x2.add(h89, t1) + h01 = i64x2.add(h01, t2) + h23 = i64x2.add(h23, t3) + h45 = i64x2.add(h45, t4) + + // f[7] + fi = load(f_ptr, 28) + fv = i32x4.splat(fi) + fv = i32x4.add(fv, v128.and(fv, odds)) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m)) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h78 = i64x2.add(h78, t0) + h90 = i64x2.add(h90, t1) + h12 = i64x2.add(h12, t2) + h34 = i64x2.add(h34, t3) + h56 = i64x2.add(h56, t4) + + // f[8] + fi = load(f_ptr, 32) + fv = i32x4.splat(fi) + + t0 = i64x2.extmul_low_i32x4_s(fv, g0123) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h89 = i64x2.add(h89, t0) + h01 = i64x2.add(h01, t1) + h23 = i64x2.add(h23, t2) + h45 = i64x2.add(h45, t3) + h67 = i64x2.add(h67, t4) + + // f[9] + fi = load(f_ptr, 36) + fv = i32x4.splat(fi) + fv = i32x4.add(fv, v128.and(fv, odds)) + + t0 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m)) + t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19) + t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19) + t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19) + t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19) + + h90 = i64x2.add(h90, t0) + h12 = i64x2.add(h12, t1) + h34 = i64x2.add(h34, t2) + h56 = i64x2.add(h56, t3) + h78 = i64x2.add(h78, t4) + + // extract scalars from vectors + let h0: i64 = i64x2.extract_lane(h90, 1) + i64x2.extract_lane(h01, 0) + let h1: i64 = i64x2.extract_lane(h01, 1) + i64x2.extract_lane(h12, 0) + let h2: i64 = i64x2.extract_lane(h12, 1) + i64x2.extract_lane(h23, 0) + let h3: i64 = i64x2.extract_lane(h23, 1) + i64x2.extract_lane(h34, 0) + let h4: i64 = i64x2.extract_lane(h34, 1) + i64x2.extract_lane(h45, 0) + let h5: i64 = i64x2.extract_lane(h45, 1) + i64x2.extract_lane(h56, 0) + let h6: i64 = i64x2.extract_lane(h56, 1) + i64x2.extract_lane(h67, 0) + let h7: i64 = i64x2.extract_lane(h67, 1) + i64x2.extract_lane(h78, 0) + let h8: i64 = i64x2.extract_lane(h78, 1) + i64x2.extract_lane(h89, 0) + let h9: i64 = i64x2.extract_lane(h89, 1) + i64x2.extract_lane(h90, 0) + + // extract scalars from vectors during first carry + let carry0: i64 + let carry1: i64 + let carry2: i64 + let carry3: i64 + let carry4: i64 + let carry5: i64 + let carry6: i64 + let carry7: i64 + let carry8: i64 + let carry9: i64 + + carry0 = (h0 + (1 << 25)) >> 26 + h1 += carry0 + h0 -= carry0 << 26 + carry4 = (h4 + (1 << 25)) >> 26 + h5 += carry4 + h4 -= carry4 << 26 + + carry1 = (h1 + (1 << 24)) >> 25 + h2 += carry1 + h1 -= carry1 << 25 + carry5 = (h5 + (1 << 24)) >> 25 + h6 += carry5 + h5 -= carry5 << 25 + + carry2 = (h2 + (1 << 25)) >> 26 + h3 += carry2 + h2 -= carry2 << 26 + carry6 = (h6 + (1 << 25)) >> 26 + h7 += carry6 + h6 -= carry6 << 26 + + carry3 = (h3 + (1 << 24)) >> 25 + h4 += carry3 + h3 -= carry3 << 25 + carry7 = (h7 + (1 << 24)) >> 25 + h8 += carry7 + h7 -= carry7 << 25 + + carry4 = (h4 + (1 << 25)) >> 26 + h5 += carry4 + h4 -= carry4 << 26 + carry8 = (h8 + (1 << 25)) >> 26 + h9 += carry8 + h8 -= carry8 << 26 + + carry9 = (h9 + (1 << 24)) >> 25 + h0 += carry9 * 19 + h9 -= carry9 << 25 + + carry0 = (h0 + (1 << 25)) >> 26 + h1 += carry0 + h0 -= carry0 << 26 + + // assign results to output + const h_ptr: usize = changetype(h) + store(h_ptr, h0, 0) + store(h_ptr, h1, 4) + store(h_ptr, h2, 8) + store(h_ptr, h3, 12) + store(h_ptr, h4, 16) + store(h_ptr, h5, 20) + store(h_ptr, h6, 24) + store(h_ptr, h7, 28) + store(h_ptr, h8, 32) + store(h_ptr, h9, 36) } /** @@ -397,17 +568,11 @@ export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void //@ts-expect-error @inline export function fe_neg (h: FieldElement, f: FieldElement): void { - const o = changetype(h) - const a = changetype(f) - - let ai = v128.load(a) - v128.store(o, v128.neg(ai)) - - ai = v128.load(a, 16) - v128.store(o, v128.neg(ai), 16) - - ai = i64x2(load(a + 32), 0) - v128.store_lane(o, v128.neg(ai), 0, 32) + const h_ptr = changetype(h) + const f_ptr = changetype(f) + v128.store(h_ptr, v128.neg(v128.load(f_ptr))) + v128.store(h_ptr, v128.neg(v128.load(f_ptr, 16)), 16) + v128.store_lane(h_ptr, v128.neg(v128.load(f_ptr, 32)), 0, 32) } const fe_pow22523_t0: FieldElement = fe() @@ -492,100 +657,752 @@ export function fe_pow22523 (out: FieldElement, z: FieldElement): void { * so floor(2⁻²⁵⁵(h + 19 2^(-25) h9 + 2^(-1))) = q. */ export function fe_reduce (h: FieldElement, f: FieldElement): void { - let h0 = f[0] - let h1 = f[1] - let h2 = f[2] - let h3 = f[3] - let h4 = f[4] - let h5 = f[5] - let h6 = f[6] - let h7 = f[7] - let h8 = f[8] - let h9 = f[9] + const f_ptr: usize = changetype(f) + let f0: i32 = load(f_ptr, 0) + let f1: i32 = load(f_ptr, 4) + let f2: i32 = load(f_ptr, 8) + let f3: i32 = load(f_ptr, 12) + let f4: i32 = load(f_ptr, 16) + let f5: i32 = load(f_ptr, 20) + let f6: i32 = load(f_ptr, 24) + let f7: i32 = load(f_ptr, 28) + let f8: i32 = load(f_ptr, 32) + let f9: i32 = load(f_ptr, 36) let q: i32 - let carry0: i32, carry1: i32, carry2: i32, carry3: i32, carry4: i32, carry5: i32, carry6: i32, carry7: i32, carry8: i32, carry9: i32 - - q = (19 * h9 + (u32(1) << 24)) >> 25 - q = (h0 + q) >> 26 - q = (h1 + q) >> 25 - q = (h2 + q) >> 26 - q = (h3 + q) >> 25 - q = (h4 + q) >> 26 - q = (h5 + q) >> 25 - q = (h6 + q) >> 26 - q = (h7 + q) >> 25 - q = (h8 + q) >> 26 - q = (h9 + q) >> 25 + let c0: i32, c1: i32, c2: i32, c3: i32, c4: i32, c5: i32, c6: i32, c7: i32, c8: i32, c9: i32 + + q = (19 * f9 + (1 << 24)) >> 25 + q = (f0 + q) >> 26 + q = (f1 + q) >> 25 + q = (f2 + q) >> 26 + q = (f3 + q) >> 25 + q = (f4 + q) >> 26 + q = (f5 + q) >> 25 + q = (f6 + q) >> 26 + q = (f7 + q) >> 25 + q = (f8 + q) >> 26 + q = (f9 + q) >> 25 /* Goal: Output h-(2²⁵⁵-19)q, which is between 0 and 2²⁵⁵-20. */ - h0 += 19 * q + f0 += 19 * q /* Goal: Output h-2²⁵⁵ q, which is between 0 and 2²⁵⁵-20. */ - carry0 = h0 >> 26 + c0 = f0 >> 26 + f1 += c0 + f0 -= c0 << 26 + c1 = f1 >> 25 + f2 += c1 + f1 -= c1 << 25 + c2 = f2 >> 26 + f3 += c2 + f2 -= c2 << 26 + c3 = f3 >> 25 + f4 += c3 + f3 -= c3 << 25 + c4 = f4 >> 26 + f5 += c4 + f4 -= c4 << 26 + c5 = f5 >> 25 + f6 += c5 + f5 -= c5 << 25 + c6 = f6 >> 26 + f7 += c6 + f6 -= c6 << 26 + c7 = f7 >> 25 + f8 += c7 + f7 -= c7 << 25 + c8 = f8 >> 26 + f9 += c8 + f8 -= c8 << 26 + c9 = f9 >> 25 + f9 -= c9 << 25 + + const h_ptr: usize = changetype(h) + store(h_ptr, f0, 0) + store(h_ptr, f1, 4) + store(h_ptr, f2, 8) + store(h_ptr, f3, 12) + store(h_ptr, f4, 16) + store(h_ptr, f5, 20) + store(h_ptr, f6, 24) + store(h_ptr, f7, 28) + store(h_ptr, f8, 32) + store(h_ptr, f9, 36) +} + +/** + * Square a FieldElement and store the result. + * + * Some iterations of the inner loop can be skipped since + * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]` + * + * ``` + * // non-constant-time example + * for (let i = 0; i < 10; i++) { + * for (let j = i; j < 10; j++) { + * h[i + j] += f[i] * g[j] + * if (i < j) { + * h[i + j] *= 2 + * } + * } + * } + * ``` + * + * h = f * f + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * + * @param h FieldElement power destination + * @param f FieldElement base source + */ +export function fe_sq (h: FieldElement, f: FieldElement): void { + const f_ptr: usize = changetype(f) + const f0: i64 = i64(load(f_ptr, 0)) + const f1: i64 = i64(load(f_ptr, 4)) + const f2: i64 = i64(load(f_ptr, 8)) + const f3: i64 = i64(load(f_ptr, 12)) + const f4: i64 = i64(load(f_ptr, 16)) + const f5: i64 = i64(load(f_ptr, 20)) + const f6: i64 = i64(load(f_ptr, 24)) + const f7: i64 = i64(load(f_ptr, 28)) + const f8: i64 = i64(load(f_ptr, 32)) + const f9: i64 = i64(load(f_ptr, 36)) + + const f0_2: i64 = f0 * 2 + const f1_2: i64 = f1 * 2 + const f2_2: i64 = f2 * 2 + const f3_2: i64 = f3 * 2 + const f4_2: i64 = f4 * 2 + const f5_2: i64 = f5 * 2 + const f6_2: i64 = f6 * 2 + const f7_2: i64 = f7 * 2 + + const f5_19: i64 = f5 * 19 /* 1.959375*2^29 */ + const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */ + const f7_19: i64 = f7 * 19 /* 1.959375*2^29 */ + const f8_19: i64 = f8 * 19 /* 1.959375*2^30 */ + const f9_19: i64 = f9 * 19 /* 1.959375*2^29 */ + + const f7_38: i64 = f7 * 38 /* 1.959375*2^30 */ + const f9_38: i64 = f9 * 38 /* 1.959375*2^30 */ + + let h0: i64 = f0 * f0 + let h1: i64 = f0_2 * f1 + let h2: i64 = f0_2 * f2 + let h3: i64 = f0_2 * f3 + let h4: i64 = f0_2 * f4 + let h5: i64 = f0_2 * f5 + let h6: i64 = f0_2 * f6 + let h7: i64 = f0_2 * f7 + let h8: i64 = f0_2 * f8 + let h9: i64 = f0_2 * f9 + + h2 += f1_2 * f1 + h3 += f1_2 * f2 + h4 += f1_2 * f3_2 + h5 += f1_2 * f4 + h6 += f1_2 * f5_2 + h7 += f1_2 * f6 + h8 += f1_2 * f7_2 + h9 += f1_2 * f8 + h0 += f1_2 * f9_38 + + h4 += f2 * f2 + h5 += f2_2 * f3 + h6 += f2_2 * f4 + h7 += f2_2 * f5 + h8 += f2_2 * f6 + h9 += f2_2 * f7 + h0 += f2_2 * f8_19 + h1 += f2_2 * f9_19 + + h6 += f3_2 * f3 + h7 += f3_2 * f4 + h8 += f3_2 * f5_2 + h9 += f3_2 * f6 + h0 += f3_2 * f7_38 + h1 += f3_2 * f8_19 + h2 += f3_2 * f9_38 + + h8 += f4 * f4 + h9 += f4_2 * f5 + h0 += f4_2 * f6_19 + h1 += f4_2 * f7_19 + h2 += f4_2 * f8_19 + h3 += f4_2 * f9_19 + + h0 += f5_2 * f5_19 + h1 += f5_2 * f6_19 + h2 += f5_2 * f7_38 + h3 += f5_2 * f8_19 + h4 += f5_2 * f9_38 + + h2 += f6 * f6_19 + h3 += f6_2 * f7_19 + h4 += f6_2 * f8_19 + h5 += f6_2 * f9_19 + + h4 += f7_2 * f7_19 + h5 += f7_2 * f8_19 + h6 += f7_2 * f9_38 + + h6 += f8 * f8_19 + h7 += f8 * f9_38 + + h8 += f9 * f9_38 + + let carry0: i64 + let carry1: i64 + let carry2: i64 + let carry3: i64 + let carry4: i64 + let carry5: i64 + let carry6: i64 + let carry7: i64 + let carry8: i64 + let carry9: i64 + + carry0 = (h0 + i64(1 << 25)) >> 26 h1 += carry0 - h0 -= carry0 * u32(1 << 26) - carry1 = h1 >> 25 - h2 += carry1 - h1 -= carry1 * u32(1 << 25) - carry2 = h2 >> 26 - h3 += carry2 - h2 -= carry2 * u32(1 << 26) - carry3 = h3 >> 25 - h4 += carry3 - h3 -= carry3 * u32(1 << 25) - carry4 = h4 >> 26 + h0 -= carry0 << 26 + carry4 = (h4 + i64(1 << 25)) >> 26 h5 += carry4 - h4 -= carry4 * u32(1 << 26) - carry5 = h5 >> 25 + h4 -= carry4 << 26 + + carry1 = (h1 + i64(1 << 24)) >> 25 + h2 += carry1 + h1 -= carry1 << 25 + carry5 = (h5 + i64(1 << 24)) >> 25 h6 += carry5 - h5 -= carry5 * u32(1 << 25) - carry6 = h6 >> 26 + h5 -= carry5 << 25 + + carry2 = (h2 + i64(1 << 25)) >> 26 + h3 += carry2 + h2 -= carry2 << 26 + carry6 = (h6 + i64(1 << 25)) >> 26 h7 += carry6 - h6 -= carry6 * u32(1 << 26) - carry7 = h7 >> 25 + h6 -= carry6 << 26 + + carry3 = (h3 + i64(1 << 24)) >> 25 + h4 += carry3 + h3 -= carry3 << 25 + carry7 = (h7 + i64(1 << 24)) >> 25 h8 += carry7 - h7 -= carry7 * u32(1 << 25) - carry8 = h8 >> 26 + h7 -= carry7 << 25 + + carry4 = (h4 + i64(1 << 25)) >> 26 + h5 += carry4 + h4 -= carry4 << 26 + carry8 = (h8 + i64(1 << 25)) >> 26 h9 += carry8 - h8 -= carry8 * u32(1 << 26) - carry9 = h9 >> 25 - h9 -= carry9 * u32(1 << 25) - - h[0] = h0 - h[1] = h1 - h[2] = h2 - h[3] = h3 - h[4] = h4 - h[5] = h5 - h[6] = h6 - h[7] = h7 - h[8] = h8 - h[9] = h9 + h8 -= carry8 << 26 + + carry9 = (h9 + i64(1 << 24)) >> 25 + h0 += carry9 * 19 + h9 -= carry9 << 25 + + carry0 = (h0 + i64(1 << 25)) >> 26 + h1 += carry0 + h0 -= carry0 << 26 + + const h_ptr: usize = changetype(h) + store(h_ptr, h0, 0) + store(h_ptr, h1, 4) + store(h_ptr, h2, 8) + store(h_ptr, h3, 12) + store(h_ptr, h4, 16) + store(h_ptr, h5, 20) + store(h_ptr, h6, 24) + store(h_ptr, h7, 28) + store(h_ptr, h8, 32) + store(h_ptr, h9, 36) } /** * Square a FieldElement and store the result. * - * @param h FieldElement power destination - * @param f FieldElement base source + * Some iterations of the inner loop can be skipped since + * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]` + * + * ``` + * // non-constant-time example + * for (let i = 0; i < 10; i++) { + * for (let j = i; j < 10; j++) { + * h[i + j] += f[i] * g[j] + * if (i < j) { + * h[i + j] *= 2 + * } + * } + * } + * ``` + * + * h = f * f + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * + * @param hx FieldElement power destination + * @param fx FieldElement base source */ -//@ts-expect-error -@inline -export function fe_sq (h: FieldElement, f: FieldElement): void { - fe_mul(h, f, f) +export function fe_sq_vec (hx: FieldElement, fx: FieldElement, hy: FieldElement, fy: FieldElement): void { + const fx_ptr: usize = changetype(fx) + const fy_ptr: usize = changetype(fy) + const f0: v128 = i32x4(load(fx_ptr, 0), load(fy_ptr, 0), 0, 0) + const f1: v128 = i32x4(load(fx_ptr, 4), load(fy_ptr, 4), 0, 0) + const f2: v128 = i32x4(load(fx_ptr, 8), load(fy_ptr, 8), 0, 0) + const f3: v128 = i32x4(load(fx_ptr, 12), load(fy_ptr, 12), 0, 0) + const f4: v128 = i32x4(load(fx_ptr, 16), load(fy_ptr, 16), 0, 0) + const f5: v128 = i32x4(load(fx_ptr, 20), load(fy_ptr, 20), 0, 0) + const f6: v128 = i32x4(load(fx_ptr, 24), load(fy_ptr, 24), 0, 0) + const f7: v128 = i32x4(load(fx_ptr, 28), load(fy_ptr, 28), 0, 0) + const f8: v128 = i32x4(load(fx_ptr, 32), load(fy_ptr, 32), 0, 0) + const f9: v128 = i32x4(load(fx_ptr, 36), load(fy_ptr, 36), 0, 0) + + const v19: v128 = i32x4.splat(19) + const f5_19: v128 = i32x4.mul(f5, v19) /* 1.959375*2^29 */ + const f6_19: v128 = i32x4.mul(f6, v19) /* 1.959375*2^30 */ + const f7_19: v128 = i32x4.mul(f7, v19) /* 1.959375*2^29 */ + const f8_19: v128 = i32x4.mul(f8, v19) /* 1.959375*2^30 */ + const f9_19: v128 = i32x4.mul(f9, v19) /* 1.959375*2^29 */ + + // f[0] + let f_2: v128 = i32x4.shl(f0, 1) + + let h0: v128 = i64x2.extmul_low_i32x4_s(f0, f0) + let h1: v128 = i64x2.extmul_low_i32x4_s(f_2, f1) + let h2: v128 = i64x2.extmul_low_i32x4_s(f_2, f2) + let h3: v128 = i64x2.extmul_low_i32x4_s(f_2, f3) + let h4: v128 = i64x2.extmul_low_i32x4_s(f_2, f4) + let h5: v128 = i64x2.extmul_low_i32x4_s(f_2, f5) + let h6: v128 = i64x2.extmul_low_i32x4_s(f_2, f6) + let h7: v128 = i64x2.extmul_low_i32x4_s(f_2, f7) + let h8: v128 = i64x2.extmul_low_i32x4_s(f_2, f8) + let h9: v128 = i64x2.extmul_low_i32x4_s(f_2, f9) + + // f[1] + f_2 = i32x4.shl(f1, 1) + let f_4: v128 = i32x4.shl(f1, 2) + + let t0: v128 = i64x2.extmul_low_i32x4_s(f_2, f1) + let t1: v128 = i64x2.extmul_low_i32x4_s(f_2, f2) + let t2: v128 = i64x2.extmul_low_i32x4_s(f_4, f3) + let t3: v128 = i64x2.extmul_low_i32x4_s(f_2, f4) + let t4: v128 = i64x2.extmul_low_i32x4_s(f_4, f5) + let t5: v128 = i64x2.extmul_low_i32x4_s(f_2, f6) + let t6: v128 = i64x2.extmul_low_i32x4_s(f_4, f7) + let t7: v128 = i64x2.extmul_low_i32x4_s(f_2, f8) + let t8: v128 = i64x2.extmul_low_i32x4_s(f_4, f9_19) + + h2 = i64x2.add(h2, t0) + h3 = i64x2.add(h3, t1) + h4 = i64x2.add(h4, t2) + h5 = i64x2.add(h5, t3) + h6 = i64x2.add(h6, t4) + h7 = i64x2.add(h7, t5) + h8 = i64x2.add(h8, t6) + h9 = i64x2.add(h9, t7) + h0 = i64x2.add(h0, t8) + + // f[2] + f_2 = i32x4.shl(f2, 1) + + t0 = i64x2.extmul_low_i32x4_s(f2, f2) + t1 = i64x2.extmul_low_i32x4_s(f_2, f3) + t2 = i64x2.extmul_low_i32x4_s(f_2, f4) + t3 = i64x2.extmul_low_i32x4_s(f_2, f5) + t4 = i64x2.extmul_low_i32x4_s(f_2, f6) + t5 = i64x2.extmul_low_i32x4_s(f_2, f7) + t6 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t7 = i64x2.extmul_low_i32x4_s(f_2, f9_19) + + h4 = i64x2.add(h4, t0) + h5 = i64x2.add(h5, t1) + h6 = i64x2.add(h6, t2) + h7 = i64x2.add(h7, t3) + h8 = i64x2.add(h8, t4) + h9 = i64x2.add(h9, t5) + h0 = i64x2.add(h0, t6) + h1 = i64x2.add(h1, t7) + + // f[3] + f_2 = i32x4.shl(f3, 1) + f_4 = i32x4.shl(f3, 2) + + t0 = i64x2.extmul_low_i32x4_s(f_2, f3) + t1 = i64x2.extmul_low_i32x4_s(f_2, f4) + t2 = i64x2.extmul_low_i32x4_s(f_4, f5) + t3 = i64x2.extmul_low_i32x4_s(f_2, f6) + t4 = i64x2.extmul_low_i32x4_s(f_4, f7_19) /* 1.959375*2^30 */ + t5 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t6 = i64x2.extmul_low_i32x4_s(f_4, f9_19) + + h6 = i64x2.add(h6, t0) + h7 = i64x2.add(h7, t1) + h8 = i64x2.add(h8, t2) + h9 = i64x2.add(h9, t3) + h0 = i64x2.add(h0, t4) /* 1.959375*2^30 */ + h1 = i64x2.add(h1, t5) + h2 = i64x2.add(h2, t6) + + // f[4] + f_2 = i32x4.shl(f4, 1) + + t0 = i64x2.extmul_low_i32x4_s(f4, f4) + t1 = i64x2.extmul_low_i32x4_s(f_2, f5) + t2 = i64x2.extmul_low_i32x4_s(f_2, f6_19) + t3 = i64x2.extmul_low_i32x4_s(f_2, f7_19) + t4 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t5 = i64x2.extmul_low_i32x4_s(f_2, f9_19) + + h8 = i64x2.add(h8, t0) + h9 = i64x2.add(h9, t1) + h0 = i64x2.add(h0, t2) + h1 = i64x2.add(h1, t3) + h2 = i64x2.add(h2, t4) + h3 = i64x2.add(h3, t5) + + // f[5] + f_2 = i32x4.shl(f5, 1) + f_4 = i32x4.shl(f5, 2) + + t0 = i64x2.extmul_low_i32x4_s(f_2, f5_19) + t1 = i64x2.extmul_low_i32x4_s(f_2, f6_19) + t2 = i64x2.extmul_low_i32x4_s(f_4, f7_19) + t3 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t4 = i64x2.extmul_low_i32x4_s(f_4, f9_19) /* 1.959375*2^30 */ + + h0 = i64x2.add(h0, t0) + h1 = i64x2.add(h1, t1) + h2 = i64x2.add(h2, t2) + h3 = i64x2.add(h3, t3) + h4 = i64x2.add(h4, t4) /* 1.959375*2^30 */ + + // f[6] + f_2 = i32x4.shl(f6, 1) + + t0 = i64x2.extmul_low_i32x4_s(f6, f6_19) + t1 = i64x2.extmul_low_i32x4_s(f_2, f7_19) + t2 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t3 = i64x2.extmul_low_i32x4_s(f_2, f9_19) + + h2 = i64x2.add(h2, t0) + h3 = i64x2.add(h3, t1) + h4 = i64x2.add(h4, t2) + h5 = i64x2.add(h5, t3) + + // f[7] + f_2 = i32x4.shl(f7, 1) + f_4 = i32x4.shl(f7, 2) + + t0 = i64x2.extmul_low_i32x4_s(f_2, f7_19) + t1 = i64x2.extmul_low_i32x4_s(f_2, f8_19) + t2 = i64x2.extmul_low_i32x4_s(f_4, f9_19) + + h4 = i64x2.add(h4, t0) + h5 = i64x2.add(h5, t1) + h6 = i64x2.add(h6, t2) + + // f[8] + f_2 = i32x4.shl(f8, 1) + + t0 = i64x2.extmul_low_i32x4_s(f8, f8_19) + t1 = i64x2.extmul_low_i32x4_s(f_2, f9_19) + + h6 = i64x2.add(h6, t0) + h7 = i64x2.add(h7, t1) + + // f[9] + f_2 = i32x4.shl(f9, 1) + + t0 = i64x2.extmul_low_i32x4_s(f_2, f9_19) + + h8 = i64x2.add(h8, t0) + + const v24: v128 = i64x2.splat(1 << 24) + const v25: v128 = i64x2.splat(1 << 25) + let carry0: v128 + let carry1: v128 + let carry2: v128 + let carry3: v128 + let carry4: v128 + let carry5: v128 + let carry6: v128 + let carry7: v128 + let carry8: v128 + let carry9: v128 + + carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26) + h1 = i64x2.add(h1, carry0) + h0 = i64x2.sub(h0, i64x2.shl(carry0, 26)) + carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26) + h5 = i64x2.add(h5, carry4) + h4 = i64x2.sub(h4, i64x2.shl(carry4, 26)) + + carry1 = i64x2.shr_s(((i64x2.add(h1, v24))), 25) + h2 = i64x2.add(h2, carry1) + h1 = i64x2.sub(h1, i64x2.shl(carry1, 25)) + carry5 = i64x2.shr_s(((i64x2.add(h5, v24))), 25) + h6 = i64x2.add(h6, carry5) + h5 = i64x2.sub(h5, i64x2.shl(carry5, 25)) + + carry2 = i64x2.shr_s(((i64x2.add(h2, v25))), 26) + h3 = i64x2.add(h3, carry2) + h2 = i64x2.sub(h2, i64x2.shl(carry2, 26)) + carry6 = i64x2.shr_s(((i64x2.add(h6, v25))), 26) + h7 = i64x2.add(h7, carry6) + h6 = i64x2.sub(h6, i64x2.shl(carry6, 26)) + + carry3 = i64x2.shr_s(((i64x2.add(h3, v24))), 25) + h4 = i64x2.add(h4, carry3) + h3 = i64x2.sub(h3, i64x2.shl(carry3, 25)) + carry7 = i64x2.shr_s(((i64x2.add(h7, v24))), 25) + h8 = i64x2.add(h8, carry7) + h7 = i64x2.sub(h7, i64x2.shl(carry7, 25)) + + carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26) + h5 = i64x2.add(h5, carry4) + h4 = i64x2.sub(h4, i64x2.shl(carry4, 26)) + carry8 = i64x2.shr_s(((i64x2.add(h8, v25))), 26) + h9 = i64x2.add(h9, carry8) + h8 = i64x2.sub(h8, i64x2.shl(carry8, 26)) + + carry9 = i64x2.shr_s(((i64x2.add(h9, v24))), 25) + h0 = i64x2.add(h0, i64x2.mul(carry9, i64x2.splat(19))) + h9 = i64x2.sub(h9, i64x2.shl(carry9, 25)) + + carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26) + h1 = i64x2.add(h1, carry0) + h0 = i64x2.sub(h0, i64x2.shl(carry0, 26)) + + const hx_ptr: usize = changetype(hx) + store(hx_ptr, i64x2.extract_lane(h0, 0), 0) + store(hx_ptr, i64x2.extract_lane(h1, 0), 4) + store(hx_ptr, i64x2.extract_lane(h2, 0), 8) + store(hx_ptr, i64x2.extract_lane(h3, 0), 12) + store(hx_ptr, i64x2.extract_lane(h4, 0), 16) + store(hx_ptr, i64x2.extract_lane(h5, 0), 20) + store(hx_ptr, i64x2.extract_lane(h6, 0), 24) + store(hx_ptr, i64x2.extract_lane(h7, 0), 28) + store(hx_ptr, i64x2.extract_lane(h8, 0), 32) + store(hx_ptr, i64x2.extract_lane(h9, 0), 36) + + const hy_ptr: usize = changetype(hy) + store(hy_ptr, i64x2.extract_lane(h0, 1), 0) + store(hy_ptr, i64x2.extract_lane(h1, 1), 4) + store(hy_ptr, i64x2.extract_lane(h2, 1), 8) + store(hy_ptr, i64x2.extract_lane(h3, 1), 12) + store(hy_ptr, i64x2.extract_lane(h4, 1), 16) + store(hy_ptr, i64x2.extract_lane(h5, 1), 20) + store(hy_ptr, i64x2.extract_lane(h6, 1), 24) + store(hy_ptr, i64x2.extract_lane(h7, 1), 28) + store(hy_ptr, i64x2.extract_lane(h8, 1), 32) + store(hy_ptr, i64x2.extract_lane(h9, 1), 36) } /** * Square a FieldElement, double it, and store the result. * + * h = 2 * f * f + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc. + * * @param h FieldElement power destination * @param f FieldElement base source */ -//@ts-expect-error -@inline export function fe_sq2 (h: FieldElement, f: FieldElement): void { - fe_mul(h, f, f) - fe_add(h, h, h) + const f_ptr: usize = changetype(f) + const f0: i64 = i64(load(f_ptr, 0)) + const f1: i64 = i64(load(f_ptr, 4)) + const f2: i64 = i64(load(f_ptr, 8)) + const f3: i64 = i64(load(f_ptr, 12)) + const f4: i64 = i64(load(f_ptr, 16)) + const f5: i64 = i64(load(f_ptr, 20)) + const f6: i64 = i64(load(f_ptr, 24)) + const f7: i64 = i64(load(f_ptr, 28)) + const f8: i64 = i64(load(f_ptr, 32)) + const f9: i64 = i64(load(f_ptr, 36)) + + const f0_2: i64 = f0 * 2 + const f1_2: i64 = f1 * 2 + const f2_2: i64 = f2 * 2 + const f3_2: i64 = f3 * 2 + const f4_2: i64 = f4 * 2 + const f5_2: i64 = f5 * 2 + const f6_2: i64 = f6 * 2 + const f7_2: i64 = f7 * 2 + + const f5_38: i64 = f5 * 38 /* 1.959375*2^30 */ + const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */ + const f7_38: i64 = f7 * 38/* 1.959375*2^30 */ + const f8_19: i64 = f8 * 19/* 1.959375*2^30 */ + const f9_38: i64 = f9 * 38/* 1.959375*2^30 */ + + const f0f0: i64 = f0 * f0 + const f0f1_2: i64 = f0_2 * f1 + const f0f2_2: i64 = f0_2 * f2 + const f0f3_2: i64 = f0_2 * f3 + const f0f4_2: i64 = f0_2 * f4 + const f0f5_2: i64 = f0_2 * f5 + const f0f6_2: i64 = f0_2 * f6 + const f0f7_2: i64 = f0_2 * f7 + const f0f8_2: i64 = f0_2 * f8 + const f0f9_2: i64 = f0_2 * f9 + + const f1f1_2: i64 = f1_2 * f1 + const f1f2_2: i64 = f1_2 * f2 + const f1f3_4: i64 = f1_2 * f3_2 + const f1f4_2: i64 = f1_2 * f4 + const f1f5_4: i64 = f1_2 * f5_2 + const f1f6_2: i64 = f1_2 * f6 + const f1f7_4: i64 = f1_2 * f7_2 + const f1f8_2: i64 = f1_2 * f8 + const f1f9_76: i64 = f1_2 * f9_38 + + const f2f2: i64 = f2 * f2 + const f2f3_2: i64 = f2_2 * f3 + const f2f4_2: i64 = f2_2 * f4 + const f2f5_2: i64 = f2_2 * f5 + const f2f6_2: i64 = f2_2 * f6 + const f2f7_2: i64 = f2_2 * f7 + const f2f8_38: i64 = f2_2 * f8_19 + const f2f9_38: i64 = f2 * f9_38 + + const f3f3_2: i64 = f3_2 * f3 + const f3f4_2: i64 = f3_2 * f4 + const f3f5_4: i64 = f3_2 * f5_2 + const f3f6_2: i64 = f3_2 * f6 + const f3f7_76: i64 = f3_2 * f7_38 + const f3f8_38: i64 = f3_2 * f8_19 + const f3f9_76: i64 = f3_2 * f9_38 + + const f4f4: i64 = f4 * f4 + const f4f5_2: i64 = f4_2 * f5 + const f4f6_38: i64 = f4_2 * f6_19 + const f4f7_38: i64 = f4 * f7_38 + const f4f8_38: i64 = f4_2 * f8_19 + const f4f9_38: i64 = f4 * f9_38 + + const f5f5_38: i64 = f5 * f5_38 + const f5f6_38: i64 = f5_2 * f6_19 + const f5f7_76: i64 = f5_2 * f7_38 + const f5f8_38: i64 = f5_2 * f8_19 + const f5f9_76: i64 = f5_2 * f9_38 + + const f6f6_19: i64 = f6 * f6_19 + const f6f7_38: i64 = f6 * f7_38 + const f6f8_38: i64 = f6_2 * f8_19 + const f6f9_38: i64 = f6 * f9_38 + + const f7f7_38: i64 = f7 * f7_38 + const f7f8_38: i64 = f7_2 * f8_19 + const f7f9_76: i64 = f7_2 * f9_38 + + const f8f8_19: i64 = f8 * f8_19 + const f8f9_38: i64 = f8 * f9_38 + + const f9f9_38: i64 = f9 * f9_38 + + let h0: i64 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38 + let h1: i64 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38 + let h2: i64 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19 + let h3: i64 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38 + let h4: i64 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38 + let h5: i64 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38 + let h6: i64 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19 + let h7: i64 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38 + let h8: i64 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38 + let h9: i64 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2 + + h0 *= 2 + h1 *= 2 + h2 *= 2 + h3 *= 2 + h4 *= 2 + h5 *= 2 + h6 *= 2 + h7 *= 2 + h8 *= 2 + h9 *= 2 + + let carry0: i64 + let carry1: i64 + let carry2: i64 + let carry3: i64 + let carry4: i64 + let carry5: i64 + let carry6: i64 + let carry7: i64 + let carry8: i64 + let carry9: i64 + + carry0 = (h0 + i64(1 << 25)) >> 26 + h1 += carry0 + h0 -= carry0 << 26 + carry4 = (h4 + i64(1 << 25)) >> 26 + h5 += carry4 + h4 -= carry4 << 26 + + carry1 = (h1 + i64(1 << 24)) >> 25 + h2 += carry1 + h1 -= carry1 << 25 + carry5 = (h5 + i64(1 << 24)) >> 25 + h6 += carry5 + h5 -= carry5 << 25 + + carry2 = (h2 + i64(1 << 25)) >> 26 + h3 += carry2 + h2 -= carry2 << 26 + carry6 = (h6 + i64(1 << 25)) >> 26 + h7 += carry6 + h6 -= carry6 << 26 + + carry3 = (h3 + i64(1 << 24)) >> 25 + h4 += carry3 + h3 -= carry3 << 25 + carry7 = (h7 + i64(1 << 24)) >> 25 + h8 += carry7 + h7 -= carry7 << 25 + + carry4 = (h4 + i64(1 << 25)) >> 26 + h5 += carry4 + h4 -= carry4 << 26 + carry8 = (h8 + i64(1 << 25)) >> 26 + h9 += carry8 + h8 -= carry8 << 26 + + carry9 = (h9 + i64(1 << 24)) >> 25 + h0 += carry9 * 19 + h9 -= carry9 << 25 + + carry0 = (h0 + i64(1 << 25)) >> 26 + h1 += carry0 + h0 -= carry0 << 26 + + const h_ptr: usize = changetype(h) + store(h_ptr, h0, 0) + store(h_ptr, h1, 4) + store(h_ptr, h2, 8) + store(h_ptr, h3, 12) + store(h_ptr, h4, 16) + store(h_ptr, h5, 20) + store(h_ptr, h6, 24) + store(h_ptr, h7, 28) + store(h_ptr, h8, 32) + store(h_ptr, h9, 36) } /** @@ -598,21 +1415,12 @@ export function fe_sq2 (h: FieldElement, f: FieldElement): void { //@ts-expect-error @inline export function fe_sub (h: FieldElement, f: FieldElement, g: FieldElement): void { - const o = changetype(h) - const a = changetype(f) - const b = changetype(g) - - let ai = v128.load(a) - let bi = v128.load(b) - v128.store(o, v128.sub(ai, bi)) - - ai = v128.load(a, 16) - bi = v128.load(b, 16) - v128.store(o, v128.sub(ai, bi), 16) - - ai = i64x2(load(a + 32), 0) - bi = i64x2(load(b + 32), 0) - v128.store_lane(o, v128.sub(ai, bi), 0, 32) + const f_ptr = changetype(f) + const g_ptr = changetype(g) + const h_ptr = changetype(h) + v128.store(h_ptr, v128.sub(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0) + v128.store(h_ptr, v128.sub(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16) + v128.store_lane(h_ptr, v128.sub(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32) } const fe_tobytes_t: FieldElement = fe() @@ -626,183 +1434,49 @@ const fe_tobytes_t: FieldElement = fe() export function fe_tobytes (s: StaticArray, h: FieldElement): void { const t = fe_tobytes_t fe_reduce(t, h) - s[0] = u8(t[0] >> 0) - s[1] = u8(t[0] >> 8) - s[2] = u8(t[0] >> 16) - s[3] = u8((t[0] >> 24) | (t[1] * (u32(1) << 2))) - s[4] = u8(t[1] >> 6) - s[5] = u8(t[1] >> 14) - s[6] = u8((t[1] >> 22) | (t[2] * (u32(1) << 3))) - s[7] = u8(t[2] >> 5) - s[8] = u8(t[2] >> 13) - s[9] = u8((t[2] >> 21) | (t[3] * (u32(1) << 5))) - s[10] = u8(t[3] >> 3) - s[11] = u8(t[3] >> 11) - s[12] = u8((t[3] >> 19) | (t[4] * (u32(1) << 6))) - s[13] = u8(t[4] >> 2) - s[14] = u8(t[4] >> 10) - s[15] = u8(t[4] >> 18) - s[16] = u8(t[5] >> 0) - s[17] = u8(t[5] >> 8) - s[18] = u8(t[5] >> 16) - s[19] = u8((t[5] >> 24) | (t[6] * (u32(1) << 1))) - s[20] = u8(t[6] >> 7) - s[21] = u8(t[6] >> 15) - s[22] = u8((t[6] >> 23) | (t[7] * (u32(1) << 3))) - s[23] = u8(t[7] >> 5) - s[24] = u8(t[7] >> 13) - s[25] = u8((t[7] >> 21) | (t[8] * (u32(1) << 4))) - s[26] = u8(t[8] >> 4) - s[27] = u8(t[8] >> 12) - s[28] = u8((t[8] >> 20) | (t[9] * (u32(1) << 6))) - s[29] = u8(t[9] >> 2) - s[30] = u8(t[9] >> 10) - s[31] = u8(t[9] >> 18) -} - -/** - * Internal function. - * - * Normalize results of `fe_multiply` and `fe_square` across limbs. This - * requires a reduction step and two carry steps. - * - * // reduce - * for (let i = 0; i < 15; i++) { - * t[i] += 38 * t[i + 16] - * } - * // carry twice - * for (let i = 0; i < 2; i++) { - * for (let i = 0; i < 16; i++) { - * c += t[i] - * t[i] = c & 0xFFFF - * c >>= 16 - * } - * t[0] += 38 * c - * } -*/ -//@ts-expect-error -@inline -function normalize (o: usize, t: usize): void { - // reduce from 20 limbs to 10 - let x = load(t) - let y = load(t, 80) - store(t, x + (19 * y)) - - x = load(t, 8) - y = load(t, 88) - store(t, x + (19 * y), 8) - - x = load(t, 16) - y = load(t, 96) - store(t, x + (19 * y), 16) - - x = load(t, 24) - y = load(t, 104) - store(t, x + (19 * y), 24) - - x = load(t, 32) - y = load(t, 112) - store(t, x + (19 * y), 32) - - x = load(t, 40) - y = load(t, 120) - store(t, x + (19 * y), 40) - - x = load(t, 48) - y = load(t, 128) - store(t, x + (19 * y), 48) - - x = load(t, 56) - y = load(t, 136) - store(t, x + (19 * y), 56) - - x = load(t, 64) - y = load(t, 144) - store(t, x + (19 * y), 64) - - x = load(t, 72) - y = load(t, 152) - store(t, x + (19 * y), 72) - - // first carry - let v: i64 = load(t, 0) - let c: i64 = (v + (1 << 25)) >> 26 - store(t, v - (c << 26), 0) - - v = load(t, 8) + c - c = (v + (1 << 24)) >> 25 - store(t, v - (c << 25), 8) - - v = load(t, 16) + c - c = (v + (1 << 25)) >> 26 - store(t, v - (c << 26), 16) - - v = load(t, 24) + c - c = (v + (1 << 24)) >> 25 - store(t, v - (c << 25), 24) - - v = load(t, 32) + c - c = (v + (1 << 25)) >> 26 - store(t, v - (c << 26), 32) - - v = load(t, 40) + c - c = (v + (1 << 24)) >> 25 - store(t, v - (c << 25), 40) - - v = load(t, 48) + c - c = (v + (1 << 25)) >> 26 - store(t, v - (c << 26), 48) - - v = load(t, 56) + c - c = (v + (1 << 24)) >> 25 - store(t, v - (c << 25), 56) - - v = load(t, 64) + c - c = (v + (1 << 25)) >> 26 - store(t, v - (c << 26), 64) - - v = load(t, 72) + c - c = (v + (1 << 24)) >> 25 - store(t, v - (c << 25), 72) - - // fold final carry back to start then do second carry and assign to output - v = load(t, 0) + (19 * c) - c = (v + (1 << 25)) >> 26 - store(o, v - (c << 26), 0) - - v = load(t, 8) + c - c = (v + (1 << 24)) >> 25 - store(o, v - (c << 25), 4) - - v = load(t, 16) + c - c = (v + (1 << 25)) >> 26 - store(o, v - (c << 26), 8) - - v = load(t, 24) + c - c = (v + (1 << 24)) >> 25 - store(o, v - (c << 25), 12) - - v = load(t, 32) + c - c = (v + (1 << 25)) >> 26 - store(o, v - (c << 26), 16) - - v = load(t, 40) + c - c = (v + (1 << 24)) >> 25 - store(o, v - (c << 25), 20) - - v = load(t, 48) + c - c = (v + (1 << 25)) >> 26 - store(o, v - (c << 26), 24) - - v = load(t, 56) + c - c = (v + (1 << 24)) >> 25 - store(o, v - (c << 25), 28) - - v = load(t, 64) + c - c = (v + (1 << 25)) >> 26 - store(o, v - (c << 26), 32) - - v = load(t, 72) + c - c = (v + (1 << 24)) >> 25 - store(o, v - (c << 25), 36) + const t_ptr: usize = changetype(t) + const t0 = load(t_ptr, 0) + const t1 = load(t_ptr, 4) + const t2 = load(t_ptr, 8) + const t3 = load(t_ptr, 12) + const t4 = load(t_ptr, 16) + const t5 = load(t_ptr, 20) + const t6 = load(t_ptr, 24) + const t7 = load(t_ptr, 28) + const t8 = load(t_ptr, 32) + const t9 = load(t_ptr, 36) + + const s_ptr: usize = changetype(s) + store(s_ptr, t0 >> 0, 0) + store(s_ptr, t0 >> 8, 1) + store(s_ptr, t0 >> 16, 2) + store(s_ptr, (t0 >> 24) | (t1 << 2), 3) + store(s_ptr, t1 >> 6, 4) + store(s_ptr, t1 >> 14, 5) + store(s_ptr, (t1 >> 22) | (t2 << 3), 6) + store(s_ptr, t2 >> 5, 7) + store(s_ptr, t2 >> 13, 8) + store(s_ptr, (t2 >> 21) | (t3 << 5), 9) + store(s_ptr, t3 >> 3, 10) + store(s_ptr, t3 >> 11, 11) + store(s_ptr, (t3 >> 19) | (t4 << 6), 12) + store(s_ptr, t4 >> 2, 13) + store(s_ptr, t4 >> 10, 14) + store(s_ptr, t4 >> 18, 15) + store(s_ptr, t5 >> 0, 16) + store(s_ptr, t5 >> 8, 17) + store(s_ptr, t5 >> 16, 18) + store(s_ptr, (t5 >> 24) | (t6 << 1), 19) + store(s_ptr, t6 >> 7, 20) + store(s_ptr, t6 >> 15, 21) + store(s_ptr, (t6 >> 23) | (t7 << 3), 22) + store(s_ptr, t7 >> 5, 23) + store(s_ptr, t7 >> 13, 24) + store(s_ptr, (t7 >> 21) | (t8 << 4), 25) + store(s_ptr, t8 >> 4, 26) + store(s_ptr, t8 >> 12, 27) + store(s_ptr, (t8 >> 20) | (t9 << 6), 28) + store(s_ptr, t9 >> 2, 29) + store(s_ptr, t9 >> 10, 30) + store(s_ptr, t9 >> 18, 31) } diff --git a/assembly/ge.ts b/assembly/ge.ts index 386a7c9..6abd642 100644 --- a/assembly/ge.ts +++ b/assembly/ge.ts @@ -17,77 +17,13 @@ * - ge_precomp (Duif): (y+x,y-x,2dxy) */ import { base, base2 } from './base' -import { ed25519_d, ed25519_d2, fe25519_sqrtm1 } from './constants' -import { fe, fe_0, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_invert, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sq2, fe_sub, fe_tobytes, FieldElement } from './fe' +import { ed25519_d, fe25519_sqrtm1 } from './constants' +import { fe, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sub, FieldElement } from './fe' +import { ge_add_cached, ge_add_precomp, ge_cached, ge_p1p1, ge_p1p1_to_p2, ge_p1p1_to_p3, ge_p2, ge_p2_0, ge_p2_dbl, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_dbl, ge_p3_to_cached, ge_p3_tobytes, ge_precomp, ge_precomp_0, ge_sub_cached, ge_sub_precomp } from './p' import { equal, negative, optblocker_u8 } from './utils' -export class ge_p2 { - X: FieldElement = fe() - Y: FieldElement = fe() - Z: FieldElement = fe() -} - -export class ge_p3 extends ge_p2 { - T: FieldElement = fe() - __brand: 'ge_p3' = 'ge_p3' -} - -export class ge_p1p1 extends ge_p2 { - T: FieldElement = fe() - __brand: 'ge_p1p1' = 'ge_p1p1' -} - -export class ge_precomp { - yplusx: FieldElement = fe() - yminusx: FieldElement = fe() - xy2d: FieldElement = fe() -} - -export class ge_cached { - YplusX: FieldElement = fe() - YminusX: FieldElement = fe() - Z: FieldElement = fe() - T2d: FieldElement = fe() -} - -//@ts-expect-error -@inline -function ge_cached_8 (): StaticArray { - return StaticArray.fromArray([ - new ge_cached(), - new ge_cached(), - new ge_cached(), - new ge_cached(), - new ge_cached(), - new ge_cached(), - new ge_cached(), - new ge_cached() - ]) -} - -//@ts-expect-error -@inline -export function ge_p2_0 (h: ge_p2): void { - fe_0(h.X) - fe_1(h.Y) - fe_1(h.Z) -} - -//@ts-expect-error -@inline -export function ge_p3_0 (h: ge_p3): void { - ge_p2_0(h) - fe_0(h.T) -} - //@ts-expect-error @inline -export function ge_precomp_0 (h: ge_precomp): void { - fe_1(h.yplusx) - fe_1(h.yminusx) - fe_0(h.xy2d) -} - function ge_cmov (t: ge_precomp, u: ge_precomp, b: u8): void { fe_cmov(t.yplusx, u.yplusx, b) fe_cmov(t.yminusx, u.yminusx, b) @@ -116,354 +52,12 @@ function ge_cmov8 (t: ge_precomp, precomp: StaticArray, b: i8): void ge_cmov(t, minust, bnegative) } +//@ts-expect-error +@inline function ge_cmov8_base (t: ge_precomp, pos: i32, b: i8): void { ge_cmov8(t, base[pos], b) } -const ge_add_precomp_t0: FieldElement = fe() -/** - * r = p + q - */ -function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void { - const t0 = ge_add_precomp_t0 - fe_add(r.X, p.Y, p.X) - fe_sub(r.Y, p.Y, p.X) - fe_mul(r.Z, r.X, q.yplusx) - fe_mul(r.Y, r.Y, q.yminusx) - fe_mul(r.T, q.xy2d, p.T) - fe_add(t0, p.Z, p.Z) - fe_sub(r.X, r.Z, r.Y) - fe_add(r.Y, r.Z, r.Y) - fe_add(r.Z, t0, r.T) - fe_sub(r.T, t0, r.T) -} - -function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void { - fe_mul(r.X, p.X, p.T) - fe_mul(r.Y, p.Y, p.Z) - fe_mul(r.Z, p.Z, p.T) - fe_mul(r.T, p.X, p.Y) -} - -function ge_p3_to_p2 (r: ge_p2, p: ge_p3): void { - fe_copy(r.X, p.X) - fe_copy(r.Y, p.Y) - fe_copy(r.Z, p.Z) -} - -const ge_p2_dbl_t0: FieldElement = fe() -function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void { - const t0 = ge_p2_dbl_t0 - fe_sq(r.X, p.X) - fe_sq(r.Z, p.Y) - fe_sq2(r.T, p.Z) - fe_add(r.Y, p.X, p.Y) - fe_sq(t0, r.Y) - fe_add(r.Y, r.Z, r.X) - fe_sub(r.Z, r.Z, r.X) - fe_sub(r.X, t0, r.Y) - fe_sub(r.T, r.T, r.Z) -} - -const ge_p3_dbl_q: ge_p2 = new ge_p2() -/** - * r = 2 * p - */ -function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void { - const q = ge_p3_dbl_q - ge_p3_to_p2(q, p) - ge_p2_dbl(r, q) -} - -/** - * r = p - */ -function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void { - fe_mul(r.X, p.X, p.T) - fe_mul(r.Y, p.Y, p.Z) - fe_mul(r.Z, p.Z, p.T) -} - -/** - * r = p - */ -function ge_p3_to_cached (r: ge_cached, p: ge_p3): void { - fe_add(r.YplusX, p.Y, p.X) - fe_sub(r.YminusX, p.Y, p.X) - fe_copy(r.Z, p.Z) - fe_mul(r.T2d, p.T, ed25519_d2) -} - -const ge_add_cached_t0: FieldElement = fe() -/** - * r = p + q - */ -function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void { - const t0 = ge_add_cached_t0 - fe_add(r.X, p.Y, p.X) - fe_sub(r.Y, p.Y, p.X) - fe_mul(r.Z, r.X, q.YplusX) - fe_mul(r.Y, r.Y, q.YminusX) - fe_mul(r.T, q.T2d, p.T) - fe_mul(r.X, p.Z, q.Z) - fe_add(t0, r.X, r.X) - fe_sub(r.X, r.Z, r.Y) - fe_add(r.Y, r.Z, r.Y) - fe_add(r.Z, t0, r.T) - fe_sub(r.T, t0, r.T) -} - -function ge_cached_0 (h: ge_cached): void { - fe_1(h.YplusX) - fe_1(h.YminusX) - fe_1(h.Z) - fe_0(h.T2d) -} - -function ge_cmov_cached (t: ge_cached, u: ge_cached, b: u8): void { - fe_cmov(t.YplusX, u.YplusX, b) - fe_cmov(t.YminusX, u.YminusX, b) - fe_cmov(t.Z, u.Z, b) - fe_cmov(t.T2d, u.T2d, b) -} - -const ge_cmov8_cached_minust: ge_cached = new ge_cached() -function ge_cmov8_cached (t: ge_cached, cached: StaticArray, b: i8): void { - const minust = ge_cmov8_cached_minust - const bnegative: u8 = negative(b) - const babs: u8 = b - (((-bnegative) & b) << 1) - - ge_cached_0(t) - ge_cmov_cached(t, cached[0], equal(babs, 1)) - ge_cmov_cached(t, cached[1], equal(babs, 2)) - ge_cmov_cached(t, cached[2], equal(babs, 3)) - ge_cmov_cached(t, cached[3], equal(babs, 4)) - ge_cmov_cached(t, cached[4], equal(babs, 5)) - ge_cmov_cached(t, cached[5], equal(babs, 6)) - ge_cmov_cached(t, cached[6], equal(babs, 7)) - ge_cmov_cached(t, cached[7], equal(babs, 8)) - fe_copy(minust.YplusX, t.YminusX) - fe_copy(minust.YminusX, t.YplusX) - fe_copy(minust.Z, t.Z) - fe_neg(minust.T2d, t.T2d) - ge_cmov_cached(t, minust, bnegative) -} - -const ge_scalarmult_e: StaticArray = new StaticArray(64) -const ge_scalarmult_r: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_s: ge_p2 = new ge_p2() -const ge_scalarmult_t2: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t3: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t4: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t5: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t6: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t7: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_t8: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_p2: ge_p3 = new ge_p3() -const ge_scalarmult_p3: ge_p3 = new ge_p3() -const ge_scalarmult_p4: ge_p3 = new ge_p3() -const ge_scalarmult_p5: ge_p3 = new ge_p3() -const ge_scalarmult_p6: ge_p3 = new ge_p3() -const ge_scalarmult_p7: ge_p3 = new ge_p3() -const ge_scalarmult_p8: ge_p3 = new ge_p3() -const ge_scalarmult_pi: StaticArray = ge_cached_8() -const ge_scalarmult_t: ge_cached = new ge_cached() -/** - * `h = a * p` - * - * where - * - * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]` - * - * Preconditions: - * - `a[31] <= 127` - * - `p` is public - */ -export function ge_scalarmult (h: ge_p3, a: StaticArray, p: ge_p3): void { - const e = ge_scalarmult_e - const r = ge_scalarmult_r - const s = ge_scalarmult_s - const t2 = ge_scalarmult_t2 - const t3 = ge_scalarmult_t3 - const t4 = ge_scalarmult_t4 - const t5 = ge_scalarmult_t5 - const t6 = ge_scalarmult_t6 - const t7 = ge_scalarmult_t7 - const t8 = ge_scalarmult_t8 - const p2 = ge_scalarmult_p2 - const p3 = ge_scalarmult_p3 - const p4 = ge_scalarmult_p4 - const p5 = ge_scalarmult_p5 - const p6 = ge_scalarmult_p6 - const p7 = ge_scalarmult_p7 - const p8 = ge_scalarmult_p8 - const pi = ge_scalarmult_pi - const t = ge_scalarmult_t - let carry: i8 - let i: i8 - - ge_p3_to_cached(pi[1 - 1], p) /* p */ - - ge_p3_dbl(t2, p) - ge_p1p1_to_p3(p2, t2) - ge_p3_to_cached(pi[2 - 1], p2) /* 2p = 2*p */ - - ge_add_cached(t3, p, pi[2 - 1]) - ge_p1p1_to_p3(p3, t3) - ge_p3_to_cached(pi[3 - 1], p3) /* 3p = 2p+p */ - - ge_p3_dbl(t4, p2) - ge_p1p1_to_p3(p4, t4) - ge_p3_to_cached(pi[4 - 1], p4) /* 4p = 2*2p */ - - ge_add_cached(t5, p, pi[4 - 1]) - ge_p1p1_to_p3(p5, t5) - ge_p3_to_cached(pi[5 - 1], p5) /* 5p = 4p+p */ - - ge_p3_dbl(t6, p3) - ge_p1p1_to_p3(p6, t6) - ge_p3_to_cached(pi[6 - 1], p6) /* 6p = 2*3p */ - - ge_add_cached(t7, p, pi[6 - 1]) - ge_p1p1_to_p3(p7, t7) - ge_p3_to_cached(pi[7 - 1], p7) /* 7p = 6p+p */ - - ge_p3_dbl(t8, p4) - ge_p1p1_to_p3(p8, t8) - ge_p3_to_cached(pi[8 - 1], p8) /* 8p = 2*4p */ - - for (i = 0; i < 32; ++i) { - e[2 * i + 0] = (a[i] >> 0) & 15 - e[2 * i + 1] = (a[i] >> 4) & 15 - } - /* each e[i] is between 0 and 15 */ - /* e[63] is between 0 and 7 */ - - carry = 0 - for (i = 0; i < 63; ++i) { - e[i] += carry - carry = e[i] + 8 - carry >>= 4 - e[i] -= i8(carry << 4) - } - e[63] += carry - /* each e[i] is between -8 and 8 */ - - ge_p3_0(h) - - for (i = 63; i != 0; i--) { - ge_cmov8_cached(t, pi, e[i]) - ge_add_cached(r, h, t) - - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - - ge_p1p1_to_p3(h, r) /* *16 */ - } - ge_cmov8_cached(t, pi, e[i]) - ge_add_cached(r, h, t) - - ge_p1p1_to_p3(h, r) -} - -const ge_scalarmult_base_e: StaticArray = new StaticArray(64) -const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1() -const ge_scalarmult_base_s: ge_p2 = new ge_p2() -const ge_scalarmult_base_t: ge_precomp = new ge_precomp() -/** - * `h = a * B` (with precomputation) - * - * where - * - * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]` - * - * B is the Ed25519 base point `(x,⅘)` with `x` positive. As bytes: - * - * `0x5866666666666666666666666666666666666666666666666666666666666666` - * - * Preconditions: - * - `a[31] <= 127` - */ -export function ge_scalarmult_base (h: ge_p3, a: StaticArray): void { - const e = ge_scalarmult_base_e - const r = ge_scalarmult_base_r - const s = ge_scalarmult_base_s - const t = ge_scalarmult_base_t - - for (let i = 0; i < 32; ++i) { - e[(i << 1) + 0] = (a[i] >> 0) & 15 - e[(i << 1) + 1] = (a[i] >> 4) & 15 - } - /* each e[i] is between 0 and 15 */ - /* e[63] is between 0 and 7 */ - - let carry: i8 = 0 - for (let i = 0; i < 63; ++i) { - e[i] += carry - carry = e[i] + 8 - carry >>= 4 - e[i] -= carry * (i8(1) << 4) - } - e[63] += carry - /* each e[i] is between -8 and 8 */ - - ge_p3_0(h) - - for (let i = 1; i < 64; i += 2) { - ge_cmov8_base(t, i / 2, e[i]) - ge_add_precomp(r, h, t) - ge_p1p1_to_p3(h, r) - } - - ge_p3_dbl(r, h) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p2(s, r) - ge_p2_dbl(r, s) - ge_p1p1_to_p3(h, r) - - for (let i = 0; i < 64; i += 2) { - ge_cmov8_base(t, i / 2, e[i]) - ge_add_precomp(r, h, t) - ge_p1p1_to_p3(h, r) - } -} - -const ge_p3_tobytes_recip: FieldElement = fe() -const ge_p3_tobytes_x: FieldElement = fe() -const ge_p3_tobytes_y: FieldElement = fe() -export function ge_p3_tobytes (s: StaticArray, h: ge_p3): void { - const recip = ge_p3_tobytes_recip - const x = ge_p3_tobytes_x - const y = ge_p3_tobytes_y - fe_invert(recip, h.Z) - fe_mul(x, h.X, recip) - fe_mul(y, h.Y, recip) - fe_tobytes(s, y) - s[31] ^= fe_isnegative(x) << 7 -} - -/** - * true if `s < p` - */ -export function ge_is_canonical (s: StaticArray): u8 { - let c: u8 = (s[31] & 0x7f) ^ 0x7f - for (let i = 30; i > 0; i--) { - c |= s[i] ^ 0xff - } - c = (c - 1) >> 8 - const d: u8 = (0xec - s[0]) >> 8 - return 1 - (c & d & 1) -} - const u: FieldElement = fe() const v: FieldElement = fe() const v3: FieldElement = fe() @@ -499,7 +93,7 @@ export function ge_frombytes_negate_vartime (h: ge_p3, s: StaticArray): i32 fe_mul(h.X, h.X, fe25519_sqrtm1) } - if (fe_isnegative(h.X) == (s[31] >> 7)) { /* vartime function - compiler optimization is fine */ + if (fe_isnegative(h.X) == (load(changetype(s), 31) >> 7)) { /* vartime function - compiler optimization is fine */ fe_neg(h.X, h.X) } fe_mul(h.T, h.X, h.Y) @@ -572,45 +166,102 @@ export function ge_has_small_order (p: ge_p3): i32 { } /** - * r = p + * true if `s < p` */ -export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void { - fe_copy(r.X, p.X) - fe_copy(r.Y, p.Y) - fe_copy(r.Z, p.Z) - fe_mul(r.T, p.X, p.Y) +export function ge_is_canonical (s: StaticArray): u8 { + let c: u8 = (s[31] & 0x7f) ^ 0x7f + for (let i = 30; i > 0; i--) { + c |= s[i] ^ 0xff + } + c = (c - 1) >> 8 + const d: u8 = (0xec - s[0]) >> 8 + return 1 - (c & d & 1) } -const ge_p3_sub_q_neg: ge_p3 = new ge_p3() -/* r = p-q */ -export function ge_p3_sub (r: ge_p3, p: ge_p3, q: ge_p3): void { - const q_neg = ge_p3_sub_q_neg - ge_p3_neg(q_neg, q) - ge_p3_add(r, p, q_neg) -} +const ge_scalarmult_base_e: StaticArray = new StaticArray(64) +const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1() +const ge_scalarmult_base_s: ge_p2 = new ge_p2() +const ge_scalarmult_base_t: ge_precomp = new ge_precomp() +/** + * `h = a * B` (with precomputation) + * + * where + * + * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]` + * + * B is the Ed25519 base point `(x,⅘)` with `x` positive. As bytes: + * + * `0x5866666666666666666666666666666666666666666666666666666666666666` + * + * Preconditions: + * - `a[31] <= 127` + */ +function ge_scalarmult_base (h: ge_p3, a: StaticArray): void { + const e = ge_scalarmult_base_e + const r = ge_scalarmult_base_r + const s = ge_scalarmult_base_s + const t = ge_scalarmult_base_t + + for (let i = 0; i < 32; ++i) { + e[(i << 1) + 0] = (a[i] >> 0) & 15 + e[(i << 1) + 1] = (a[i] >> 4) & 15 + } + /* each e[i] is between 0 and 15 */ + /* e[63] is between 0 and 7 */ + + let carry: i8 = 0 + for (let i = 0; i < 63; ++i) { + e[i] += carry + carry = e[i] + 8 + carry >>= 4 + e[i] -= carry * (i8(1) << 4) + } + e[63] += carry + /* each e[i] is between -8 and 8 */ + + ge_p3_0(h) + + for (let i = 1; i < 64; i += 2) { + ge_cmov8_base(t, i / 2, e[i]) + ge_add_precomp(r, h, t) + ge_p1p1_to_p3(h, r) + } + + ge_p3_dbl(r, h) + ge_p1p1_to_p2(s, r) + ge_p2_dbl(r, s) + ge_p1p1_to_p2(s, r) + ge_p2_dbl(r, s) + ge_p1p1_to_p2(s, r) + ge_p2_dbl(r, s) + ge_p1p1_to_p3(h, r) -/* r = -p */ -function ge_p3_neg (r: ge_p3, p: ge_p3): void { - fe_neg(r.X, p.X) - fe_copy(r.Y, p.Y) - fe_copy(r.Z, p.Z) - fe_neg(r.T, p.T) + for (let i = 0; i < 64; i += 2) { + ge_cmov8_base(t, i / 2, e[i]) + ge_add_precomp(r, h, t) + ge_p1p1_to_p3(h, r) + } } -const ge_p3_add_q_cached = new ge_cached() -const ge_p3_add_q_p1p1 = new ge_p1p1() -/* r = p+q */ -function ge_p3_add (r: ge_p3, p: ge_p3, q: ge_p3): void { - const q_cached = ge_p3_add_q_cached - const p1p1 = ge_p3_add_q_p1p1 - ge_p3_to_cached(q_cached, q) - ge_add_cached(p1p1, p, q_cached) - ge_p1p1_to_p3(r, p1p1) +const ge_scalarmult_base_tobytes_h: ge_p3 = new ge_p3() +export function ge_scalarmult_base_tobytes (s: StaticArray, a: StaticArray): void { + const h = ge_scalarmult_base_tobytes_h + ge_scalarmult_base(h, a) + ge_p3_tobytes(s, h) } const ge_double_scalarmult_vartime_aslide: StaticArray = new StaticArray(256) const ge_double_scalarmult_vartime_bslide: StaticArray = new StaticArray(256) -const ge_double_scalarmult_vartime_Ai: StaticArray = ge_cached_8() /* A,3A,5A,7A,9A,11A,13A,15A */ +const ge_double_scalarmult_vartime_Ai: StaticArray = StaticArray.fromArray([ + new ge_cached(), + new ge_cached(), + new ge_cached(), + new ge_cached(), + new ge_cached(), + new ge_cached(), + new ge_cached(), + new ge_cached() +]) /* A,3A,5A,7A,9A,11A,13A,15A */ const ge_double_scalarmult_vartime_t: ge_p1p1 = new ge_p1p1() const ge_double_scalarmult_vartime_u: ge_p3 = new ge_p3() const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3() @@ -622,7 +273,7 @@ const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3() * * Only used for signatures verification. */ -export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray, A: ge_p3, b: StaticArray): void { +function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray, A: ge_p3, b: StaticArray): void { const Bi = base2 const aslide = ge_double_scalarmult_vartime_aslide const bslide = ge_double_scalarmult_vartime_bslide @@ -699,6 +350,13 @@ export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray, A: g } } +const ge_double_scalarmult_vartime_p: ge_p2 = new ge_p2() +export function ge_double_scalarmult_vartime_to_p3 (r: ge_p3, a: StaticArray, A: ge_p3, b: StaticArray): void { + const p = ge_double_scalarmult_vartime_p + ge_double_scalarmult_vartime(p, a, A, b) + ge_p2_to_p3(r, p) +} + function slide_vartime (r: StaticArray, a: StaticArray): void { let i: i32 let b: i32 @@ -739,40 +397,3 @@ function slide_vartime (r: StaticArray, a: StaticArray): void { } } } - -const ge_sub_cached_t0: FieldElement = fe() -/** - * r = p - q - */ -function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void { - const t0 = ge_sub_cached_t0 - fe_add(r.X, p.Y, p.X) - fe_sub(r.Y, p.Y, p.X) - fe_mul(r.Z, r.X, q.YminusX) - fe_mul(r.Y, r.Y, q.YplusX) - fe_mul(r.T, q.T2d, p.T) - fe_mul(r.X, p.Z, q.Z) - fe_add(t0, r.X, r.X) - fe_sub(r.X, r.Z, r.Y) - fe_add(r.Y, r.Z, r.Y) - fe_sub(r.Z, t0, r.T) - fe_add(r.T, t0, r.T) -} - -const ge_sub_precomp_t0: FieldElement = fe() -/** - * r = p - q - */ -function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void { - const t0 = ge_sub_precomp_t0 - fe_add(r.X, p.Y, p.X) - fe_sub(r.Y, p.Y, p.X) - fe_mul(r.Z, r.X, q.yminusx) - fe_mul(r.Y, r.Y, q.yplusx) - fe_mul(r.T, q.xy2d, p.T) - fe_add(t0, p.Z, p.Z) - fe_sub(r.X, r.Z, r.Y) - fe_add(r.Y, r.Z, r.Y) - fe_sub(r.Z, t0, r.T) - fe_add(r.T, t0, r.T) -} diff --git a/assembly/nano-nacl.ts b/assembly/nano-nacl.ts index 6e2f75b..36fadc7 100644 --- a/assembly/nano-nacl.ts +++ b/assembly/nano-nacl.ts @@ -2,7 +2,8 @@ //! SPDX-License-Identifier: GPL-3.0-or-later import { Blake2b } from './blake2b' -import { ge_double_scalarmult_vartime, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_p2, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_sub, ge_p3_tobytes, ge_scalarmult_base } from './ge' +import { ge_double_scalarmult_vartime_to_p3, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_scalarmult_base_tobytes } from './ge' +import { ge_p3, ge_sub_p3 } from './p' import { sc_is_canonical, sc_muladd, sc_reduce } from './sc' const BLOCKHASH_BYTES: i32 = 32 @@ -29,18 +30,11 @@ function crypto_hash (o: StaticArray, i: StaticArray): void { blake2b.init().update(i).digest(o) } -const crypto_derive_A = new ge_p3() function crypto_derive (pk: StaticArray, sk: StaticArray, seed: StaticArray): void { - const A = crypto_derive_A - ge_p3_0(A) - crypto_hash(sk, seed) clamp(sk) - trace(`seed hashed to sk: ${sk.map((b: u8) => b.toString(16).padStart(2, '0')).join('')}`) - - ge_scalarmult_base(A, sk) - ge_p3_tobytes(pk, A) + ge_scalarmult_base_tobytes(pk, sk) memory.copy(changetype(sk), changetype(seed), PRIVATEKEY_BYTES) memory.copy(changetype(sk) + 32, changetype(pk), PUBLICKEY_BYTES) } @@ -48,17 +42,14 @@ function crypto_derive (pk: StaticArray, sk: StaticArray, seed: StaticAr const crypto_sign_az = new StaticArray(64) const crypto_sign_nonce = new StaticArray(64) const crypto_sign_hram = new StaticArray(64) -const crypto_sign_R = new ge_p3() const crypto_sign_zm = new StaticArray(SIGNATURE_BYTES) const crypto_sign_t = new StaticArray(PRIVATEKEY_BYTES) function crypto_sign (s: StaticArray, m: StaticArray, sk: StaticArray): void { const az = crypto_sign_az const nonce = crypto_sign_nonce const hram = crypto_sign_hram - const R = crypto_sign_R const zm = crypto_sign_zm const t = crypto_sign_t - ge_p3_0(R) // Hash secret key to private scalar `a` and prefix for nonce derivation `z` memory.copy(changetype(t), changetype(sk), PRIVATEKEY_BYTES) @@ -71,9 +62,8 @@ function crypto_sign (s: StaticArray, m: StaticArray, sk: StaticArray(SIGNEDBLOCKHASH_BYTES) const crypto_verify_S = new StaticArray(32) /** @@ -109,7 +98,6 @@ function crypto_verify (s: StaticArray, m: StaticArray, pk: StaticArray< const expected_r = crypto_verify_expected_r const A = crypto_verify_A const sb_ah = crypto_verify_sb_ah - const sb_ah_p2 = crypto_verify_sb_ah_p2 const ram = crypto_verify_ram const S = crypto_verify_S @@ -136,9 +124,8 @@ function crypto_verify (s: StaticArray, m: StaticArray, pk: StaticArray< crypto_hash(h, ram) sc_reduce(h) - ge_double_scalarmult_vartime(sb_ah_p2, h, A, S) - ge_p2_to_p3(sb_ah, sb_ah_p2) - ge_p3_sub(check, expected_r, sb_ah) + ge_double_scalarmult_vartime_to_p3(sb_ah, h, A, S) + ge_sub_p3(check, expected_r, sb_ah) return ge_has_small_order(check) - 1 } @@ -169,11 +156,7 @@ export function derive (): void { seed[i] = load(INPUT_BUFFER + i) } - const start = performance.now() crypto_derive(pk, sk, seed) - const end = performance.now() - trace('derive time', 1, end - start) - for (let i = 0; i < PUBLICKEY_BYTES; i++) { store(OUTPUT_BUFFER + i, pk[i]) } @@ -202,11 +185,7 @@ export function sign (): void { sk[i] = load(usize(i) + ptr) } - const start = performance.now() crypto_sign(s, h, sk) - const end = performance.now() - trace('sign time', 1, end - start) - for (let i = 0; i < SIGNATURE_BYTES; i++) { store(OUTPUT_BUFFER + i, s[i]) } @@ -241,10 +220,6 @@ export function verify (): void { k[i] = load(usize(i) + ptr) } - const start = performance.now() const v = crypto_verify(s, h, k) - const end = performance.now() - trace('verify time', 1, end - start) - store(OUTPUT_BUFFER, v) } diff --git a/assembly/p.ts b/assembly/p.ts new file mode 100644 index 0000000..537eeec --- /dev/null +++ b/assembly/p.ts @@ -0,0 +1,236 @@ +//! SPDX-FileCopyrightText: 2026 Chris Duncan +//! SPDX-License-Identifier: GPL-3.0-or-later + +import { ed25519_d2 } from './constants' +import { FieldElement, fe, fe_0, fe_1, fe_add, fe_copy, fe_dbl, fe_invert, fe_isnegative, fe_mul, fe_neg, fe_sq, fe_sq_vec, fe_sq2, fe_sub, fe_tobytes } from './fe' + +export class ge_p2 { + X: FieldElement = fe() + Y: FieldElement = fe() + Z: FieldElement = fe() +} + +export class ge_p1p1 extends ge_p2 { + T: FieldElement = fe() + __brand: 'ge_p1p1' = 'ge_p1p1' +} + +export class ge_p3 extends ge_p2 { + T: FieldElement = fe() + __brand: 'ge_p3' = 'ge_p3' +} +export class ge_cached { + YplusX: FieldElement = fe() + YminusX: FieldElement = fe() + Z: FieldElement = fe() + T2d: FieldElement = fe() +} + +export class ge_precomp { + yplusx: FieldElement = fe() + yminusx: FieldElement = fe() + xy2d: FieldElement = fe() +} + +//@ts-expect-error +@inline +export function ge_p2_0 (h: ge_p2): void { + fe_0(h.X) + fe_1(h.Y) + fe_1(h.Z) +} + +const ge_p2_dbl_t0: FieldElement = fe() +//@ts-expect-error +@inline +export function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void { + const t0 = ge_p2_dbl_t0 + fe_sq_vec(r.X, p.X, r.Z, p.Y) + fe_sq2(r.T, p.Z) + fe_add(r.Y, p.X, p.Y) + fe_sq(t0, r.Y) + fe_add(r.Y, r.Z, r.X) + fe_sub(r.Z, r.Z, r.X) + fe_sub(r.X, t0, r.Y) + fe_sub(r.T, r.T, r.Z) +} + +/** + * r = p + */ +//@ts-expect-error +@inline +export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void { + fe_copy(r.X, p.X) + fe_copy(r.Y, p.Y) + fe_copy(r.Z, p.Z) + fe_mul(r.T, p.X, p.Y) +} + +/** + * r = p + */ +//@ts-expect-error +@inline +export function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void { + fe_mul(r.X, p.X, p.T) + fe_mul(r.Y, p.Y, p.Z) + fe_mul(r.Z, p.Z, p.T) +} + +//@ts-expect-error +@inline +export function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void { + fe_mul(r.X, p.X, p.T) + fe_mul(r.Y, p.Y, p.Z) + fe_mul(r.Z, p.Z, p.T) + fe_mul(r.T, p.X, p.Y) +} + +//@ts-expect-error +@inline +export function ge_p3_0 (h: ge_p3): void { + ge_p2_0(h) + fe_0(h.T) +} + +const ge_p3_dbl_q: ge_p2 = new ge_p2() +/** + * r = 2 * p + */ +export function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void { + const q = ge_p3_dbl_q + fe_copy(q.X, p.X) + fe_copy(q.Y, p.Y) + fe_copy(q.Z, p.Z) + ge_p2_dbl(r, q) +} + +const ge_p3_tobytes_recip: FieldElement = fe() +const ge_p3_tobytes_x: FieldElement = fe() +const ge_p3_tobytes_y: FieldElement = fe() +export function ge_p3_tobytes (s: StaticArray, h: ge_p3): void { + const recip = ge_p3_tobytes_recip + const x = ge_p3_tobytes_x + const y = ge_p3_tobytes_y + fe_invert(recip, h.Z) + fe_mul(x, h.X, recip) + fe_mul(y, h.Y, recip) + fe_tobytes(s, y) + s[31] ^= fe_isnegative(x) << 7 +} + +/** + * r = p + */ +//@ts-expect-error +@inline +export function ge_p3_to_cached (r: ge_cached, p: ge_p3): void { + fe_add(r.YplusX, p.Y, p.X) + fe_sub(r.YminusX, p.Y, p.X) + fe_copy(r.Z, p.Z) + fe_mul(r.T2d, p.T, ed25519_d2) +} + +//@ts-expect-error +@inline +export function ge_precomp_0 (h: ge_precomp): void { + fe_1(h.yplusx) + fe_1(h.yminusx) + fe_0(h.xy2d) +} + +const ge_add_cached_t0: FieldElement = fe() +/** + * r = p + q + */ +export function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void { + const t0 = ge_add_cached_t0 + fe_add(r.X, p.Y, p.X) + fe_sub(r.Y, p.Y, p.X) + fe_mul(r.Z, r.X, q.YplusX) + fe_mul(r.Y, r.Y, q.YminusX) + fe_mul(r.T, q.T2d, p.T) + fe_mul(r.X, p.Z, q.Z) + fe_dbl(t0, r.X) + fe_sub(r.X, r.Z, r.Y) + fe_add(r.Y, r.Z, r.Y) + fe_add(r.Z, t0, r.T) + fe_sub(r.T, t0, r.T) +} + +const ge_add_precomp_t0: FieldElement = fe() +/** + * r = p + q + */ +//@ts-expect-error +@inline +export function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void { + const t0 = ge_add_precomp_t0 + fe_add(r.X, p.Y, p.X) + fe_sub(r.Y, p.Y, p.X) + fe_mul(r.Z, r.X, q.yplusx) + fe_mul(r.Y, r.Y, q.yminusx) + fe_mul(r.T, q.xy2d, p.T) + fe_dbl(t0, p.Z) + fe_sub(r.X, r.Z, r.Y) + fe_add(r.Y, r.Z, r.Y) + fe_add(r.Z, t0, r.T) + fe_sub(r.T, t0, r.T) +} + +const ge_sub_p3_q_cached = new ge_cached() +const ge_sub_p3_p1p1 = new ge_p1p1() +/* r = p-q */ +//@ts-expect-error +@inline +export function ge_sub_p3 (r: ge_p3, p: ge_p3, q: ge_p3): void { + const q_cached = ge_sub_p3_q_cached + const p1p1 = ge_sub_p3_p1p1 + + fe_neg(r.X, q.X) + fe_copy(r.Y, q.Y) + fe_copy(r.Z, q.Z) + fe_neg(r.T, q.T) + + ge_p3_to_cached(q_cached, r) + ge_add_cached(p1p1, p, q_cached) + ge_p1p1_to_p3(r, p1p1) +} + +const ge_sub_cached_t0: FieldElement = fe() +/** + * r = p - q + */ +export function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void { + const t0 = ge_sub_cached_t0 + fe_add(r.X, p.Y, p.X) + fe_sub(r.Y, p.Y, p.X) + fe_mul(r.Z, r.X, q.YminusX) + fe_mul(r.Y, r.Y, q.YplusX) + fe_mul(r.T, q.T2d, p.T) + fe_mul(r.X, p.Z, q.Z) + fe_dbl(t0, r.X) + fe_sub(r.X, r.Z, r.Y) + fe_add(r.Y, r.Z, r.Y) + fe_sub(r.Z, t0, r.T) + fe_add(r.T, t0, r.T) +} + +const ge_sub_precomp_t0: FieldElement = fe() +/** + * r = p - q + */ +export function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void { + const t0 = ge_sub_precomp_t0 + fe_add(r.X, p.Y, p.X) + fe_sub(r.Y, p.Y, p.X) + fe_mul(r.Z, r.X, q.yminusx) + fe_mul(r.Y, r.Y, q.yplusx) + fe_mul(r.T, q.xy2d, p.T) + fe_dbl(t0, p.Z) + fe_sub(r.X, r.Z, r.Y) + fe_add(r.Y, r.Z, r.Y) + fe_sub(r.Z, t0, r.T) + fe_add(r.T, t0, r.T) +} diff --git a/index.html b/index.html index fcfcd4c..656709b 100644 --- a/index.html +++ b/index.html @@ -112,26 +112,33 @@ SPDX-License-Identifier: GPL-3.0-or-later } export async function test (api, size, runs, isDebug, isSelfCheck) { + if (typeof size !== 'number' || size < 1) { + size = 1 + } + if (typeof runs !== 'number' || runs < 1) { + runs = 1 + } const selectedApi = api // Execute once on load to initialize worker and WASM const h = random(), k = random(64) await NanoNaCl.sign(h, k) const derive = sk => { + const sk8 = new Uint8Array(sk.match(/.{2}/g).map(b => parseInt(b, 16))) switch (api) { case 'NanoNaCl': { - return NanoNaCl.derive(sk) + const pk8 = NanoNaCl.deriveSync(sk8) + return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('') } case 'NanocurrencyWeb': { return NanocurrencyWeb.wallet.legacyAccounts(sk)[0].publicKey } case 'Sodium': { - const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16)) - return sodium.crypto_sign_seed_keypair(new Uint8Array(sk8), 'hex').publicKey + return sodium.crypto_sign_seed_keypair(sk8, 'hex').publicKey } case 'TweetNaCl.js': { const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16)) - const pk8 = nacl.sign.keyPair.fromSeed(new Uint8Array(sk8)).publicKey + const pk8 = nacl.sign.keyPair.fromSeed(sk8).publicKey return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('') } default: { @@ -140,24 +147,21 @@ SPDX-License-Identifier: GPL-3.0-or-later } } const sign = (h, sk, pk) => { + const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16))) + const sk8 = new Uint8Array((sk + pk).match(/.{2}/g).map(b => parseInt(b, 16))) switch (api) { case 'NanoNaCl': { - return NanoNaCl.sign(h, sk + pk) + const s8 = NanoNaCl.signSync(h8, sk8) + return [...s8].map(b => b.toString(16).padStart(2, '0')).join('') } case 'NanocurrencyWeb': { const { privateKey } = NanocurrencyWeb.wallet.legacyAccounts(sk)[0] return NanocurrencyWeb.tools.sign(privateKey, h) } case 'Sodium': { - sk = sk + pk - const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16)) - const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16)) return sodium.crypto_sign_detached(new Uint8Array(h8), new Uint8Array(sk8), 'hex') } case 'TweetNaCl.js': { - sk = sk + pk - const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16)) - const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16)) const s8 = nacl.sign.detached(new Uint8Array(h8), new Uint8Array(sk8)) return [...s8].map(b => b.toString(16).padStart(2, '0')).join('') } @@ -167,24 +171,22 @@ SPDX-License-Identifier: GPL-3.0-or-later } } const verify = (s, h, pk) => { + const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16))) + const s8 = new Uint8Array(s.match(/.{2}/g).map(b => parseInt(b, 16))) + const pk8 = new Uint8Array(pk.match(/.{2}/g).map(b => parseInt(b, 16))) switch (api) { case 'NanoNaCl': { - return NanoNaCl.verify(s, h, pk) + const v = NanoNaCl.verifySync(s8, h8, pk8) + return v[0] === 0 } case 'NanocurrencyWeb': { return NanocurrencyWeb.tools.verify(pk, s, h) } case 'Sodium': { - const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16)) - const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16)) - const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16)) - return sodium.crypto_sign_verify_detached(new Uint8Array(s8), new Uint8Array(h8), new Uint8Array(pk8)) + return sodium.crypto_sign_verify_detached(s8, h8, pk8) } case 'TweetNaCl.js': { - const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16)) - const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16)) - const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16)) - return nacl.sign.detached.verify(new Uint8Array(h8), new Uint8Array(s8), new Uint8Array(pk8)) + return nacl.sign.detached.verify(h8, s8, pk8) } default: { return null @@ -196,6 +198,7 @@ SPDX-License-Identifier: GPL-3.0-or-later api = 'NanoNaCl' document.getElementById('status').innerHTML = `RUNNING SELF-CHECK` console.log(`%cNanoNaCl`, 'color:green', 'Checking validation against known values') + // https://docs.nano.org/integration-guides/key-management/#success-response_2 TEST_VECTORS ??= { blockHash: 'BB569136FA05F8CBF65CEF2EDE368475B289C4477342976556BA4C0DDF216E45', privateKey: '781186FB9EF17DB6E3D1056550D9FAE5D5BBADA6A6BC370E4CBB938B1DC71DA3', @@ -390,9 +393,9 @@ SPDX-License-Identifier: GPL-3.0-or-later - + - + diff --git a/index.ts b/index.ts index d571e7e..92d9644 100644 --- a/index.ts +++ b/index.ts @@ -16,7 +16,7 @@ type Data = { signature?: string } -const NanoNaCl = async (bytes: number[]): Promise => { +export const NanoNaCl = async (bytes: number[]): Promise => { let isReady = false let wasm let module @@ -314,6 +314,129 @@ async function run (data: Record): Promise { } } +let isReady = false +let wasm +let module +let instance +let memory: WebAssembly.Memory +let deriveSync: (k: Uint8Array) => Uint8Array +let signSync: (h: Uint8Array, k: Uint8Array) => Uint8Array +let verifySync: (s: Uint8Array, h: Uint8Array, k: Uint8Array) => Uint8Array +try { + wasm = Uint8Array.from(nacl) + module = await WebAssembly.compile(wasm) + instance = await WebAssembly.instantiate(module, { + env: { + abort: (msg: any, file: any, row: any, col: any) => { + // ~lib/builtins/abort(~lib/string/String | null?, ~lib/string/String | null?, u32?, u32?) => void + // msg = __liftString(msg >>> 0) + // file = __liftString(file >>> 0) + row = row >>> 0 + col = col >>> 0 + console.error('wasm abort:', `msg ${msg}`, `file ${file}`, `row ${row}`, `col ${col}`) + throw new Error(msg, { cause: { file, row, col } }) + }, + "performance.now" () { + // ~lib/bindings/dom/performance.now() => f64 + return performance.now() + }, + trace: (message: any, n?: number, a0?: number, a1?: number, a2?: number, a3?: number, a4?: number) => { + // ~lib/builtins/trace(~lib/string/String, i32?, f64?, f64?, f64?, f64?, f64?) => void + message = __liftString(message >>> 0); + (() => { + // @external.js + console.log(message, ...[a0, a1, a2, a3, a4].slice(0, n)) + })() + }, + memory: new WebAssembly.Memory({ initial: 4, maximum: 4 }) + } + }) + const { exports } = instance as { exports: { derive: Derive, sign: Sign, verify: Verify, getInputPointer: GetInputPointer, getOutputPointer: GetOutputPointer, memory: WebAssembly.Memory } } + memory = exports.memory + + function __liftString (pointer: number) { + if (!pointer) return null + const + //@ts-ignore + end = pointer + new Uint32Array(memory.buffer)[pointer - 4 >>> 2] >>> 1, + //@ts-ignore + memoryU16 = new Uint16Array(memory.buffer) + let + start = pointer >>> 1, + string = "" + while (end - start > 1024) string += String.fromCharCode(...memoryU16.subarray(start, start += 1024)) + return string + String.fromCharCode(...memoryU16.subarray(start, end)) + } + + deriveSync = function (k: Uint8Array): Uint8Array { + // assembly/nano-nacl/derive() => void + let buffer: DataView | undefined = new DataView(memory.buffer) + let inPtr = exports.getInputPointer() + for (let i = 0; i < 32; i++) { + buffer.setUint8(inPtr + i, k[i]) + } + exports.derive() + const outPtr = exports.getOutputPointer() + const pk = new Uint8Array(32) + buffer = new DataView(memory.buffer) + for (let i = 0; i < 32; i++) { + pk[i] = buffer.getUint8(outPtr + i) + } + buffer = undefined + return pk + } + + signSync = function (h: Uint8Array, k: Uint8Array): Uint8Array { + // assembly/nano-nacl/sign() => void + let buffer: DataView | undefined = new DataView(memory.buffer) + let inPtr = exports.getInputPointer() + for (let i = 0; i < 32; i++) { + buffer.setUint8(inPtr + i, h[i]) + } + inPtr += 32 + for (let i = 0; i < 64; i++) { + buffer.setUint8(inPtr + i, k[i]) + } + exports.sign() + const outPtr = exports.getOutputPointer() + const s = new Uint8Array(64) + buffer = new DataView(memory.buffer) + for (let i = 0; i < 64; i++) { + s[i] = buffer.getUint8(outPtr + i) + } + buffer = undefined + return s + } + + verifySync = function (s: Uint8Array, h: Uint8Array, k: Uint8Array): Uint8Array { + // assembly/nano-nacl/verify() => void + let buffer: DataView | undefined = new DataView(memory.buffer) + let inPtr = exports.getInputPointer() + for (let i = 0; i < 64; i++) { + buffer.setUint8(inPtr + i, s[i]) + } + inPtr += 64 + for (let i = 0; i < 32; i++) { + buffer.setUint8(inPtr + i, h[i]) + } + inPtr += 32 + for (let i = 0; i < 32; i++) { + buffer.setUint8(inPtr + i, k[i]) + } + exports.verify() + const outPtr = exports.getOutputPointer() + const v = new Uint8Array(1) + buffer = new DataView(memory.buffer) + v[0] = buffer.getUint8(outPtr) + buffer = undefined + return v + } + isReady = true +} catch (err) { + throw new Error('Error instantiating WebAssembly', { cause: err }) +} +export { deriveSync, signSync, verifySync } + /** * Nano public key derivation using WebAssembly. */ -- 2.47.3