Refactor field element multiplication hot paths.

author Chris Duncan <chris@codecow.com>

Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)

committer Chris Duncan <chris@codecow.com>

Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)
author Chris Duncan <chris@codecow.com>
Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)
committer Chris Duncan <chris@codecow.com>
Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)
diff --git a/asconfig.json b/asconfig.json

index 11c4e6191232f2c345dad66101d1eedf1d556afc..2828ae298fb426342bc989a62c9eab26ec031403 100644 (file)
--- a/asconfig.json
+++ b/asconfig.json
@@ -9,7 +9,7 @@
                 "uncheckedBehavior": "always",
                 "bindings": "esm",
                 "sourceMap": false,
-               "debug": false,
+               "debug": true,
                 "runtime": "stub",
                 "exportRuntime": false,
                 "enable": [
diff --git a/assembly/base.ts b/assembly/base.ts

index c9a72936712a1d75e4ed8411b4cee5d64fd2b520..d59e79a615a2a7943842de6cd37b51713228cede 100644 (file)
--- a/assembly/base.ts
+++ b/assembly/base.ts
@@ -2,7 +2,7 @@
  //! SPDX-License-Identifier: ISC
  
  import { fe } from './fe'
-import { ge_precomp } from './ge'
+import { ge_precomp } from './p'
  
  export const base = StaticArray.fromArray<StaticArray<ge_precomp>>([
         StaticArray.fromArray<ge_precomp>([ /* 0/31 */
diff --git a/assembly/blake2b.ts b/assembly/blake2b.ts

index c61f76fbec4b0596f2da8cf5bc15cfdf8de8242a..82c654ba8dd23f2168eaccabe9be82fc5e017e41 100644 (file)
--- a/assembly/blake2b.ts
+++ b/assembly/blake2b.ts
@@ -75,8 +75,9 @@ export class Blake2b {
                 this.p[3] = 1 // depth
  
                 // initialize hash state
+               memory.copy(changetype<usize>(this.h), changetype<usize>(blake2b_iv), 64)
                 for (let i = 0; i < 8; i++) {
-                       this.h[i] = blake2b_iv[i] ^ load<u64>(changetype<usize>(this.p) + (i << 3))
+                       this.h[i] ^= load<u64>(changetype<usize>(this.p) + (i << 3))
                 }
                 return this
         }
@@ -120,13 +121,12 @@ export class Blake2b {
         }
  
         // Defined in BLAKE2 section 2.4
+       @inline
         COMPRESS (isFinal: bool): void {
  
                 // initialize state vector
-               for (let i = 0; i < 8; i++) {
-                       this.v[i] = this.h[i]
-                       this.v[i + 8] = blake2b_iv[i]
-               }
+               memory.copy(changetype<usize>(this.v), changetype<usize>(this.h), 64)
+               memory.copy(changetype<usize>(this.v) + 64, changetype<usize>(blake2b_iv), 64)
  
                 // lo 64 bits of counter
                 this.v[12] ^= v128.extract_lane<u64>(this.t, 0)
@@ -151,6 +151,7 @@ export class Blake2b {
                 }
         }
  
+       @inline
         ROUND (r: u8): void {
                 for (let i = 0; i < 8; i++) {
                         const a = blake2b_state[i][0]
@@ -162,6 +163,7 @@ export class Blake2b {
                 }
         }
  
+       @inline
         G (a: u8, b: u8, c: u8, d: u8, x: u8, y: u8): void {
                 this.v[a] = this.v[a] + this.v[b] + this.m[x]
                 this.v[d] = rotr<u64>(this.v[d] ^ this.v[a], 32)
diff --git a/assembly/constants.ts b/assembly/constants.ts

index 426adc7adcf054760f9f9fef8f812f58aa650e05..3b41cd9b31b42701251381abe4b1e1dafd286c5c 100644 (file)
--- a/assembly/constants.ts
+++ b/assembly/constants.ts
@@ -6,45 +6,12 @@ export const fe25519_sqrtm1 = StaticArray.fromArray<i32>([
         -32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
  ])
  
-/* sqrt(-486664) */
-export const ed25519_sqrtam2 = StaticArray.fromArray<i32>([
-       -12222970, -8312128, -11511410, 9067497, -15300785, -241793, 25456130, 14121551, -12187136, 3972024
-])
-
  /* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
  export const ed25519_d = StaticArray.fromArray<i32>([
         -10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
  ])
  
-/* 2 * d =
- * 16295367250680780974490674513165176452449235426866156013048779062215315747161
- */
+/* 16295367250680780974490674513165176452449235426866156013048779062215315747161 */
  export const ed25519_d2 = StaticArray.fromArray<i32>([
         -21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199
  ])
-
-/* A = 486662 */
-export const ed25519_A_32 = 486662
-export const ed25519_A = StaticArray.fromArray<i32>([
-       ed25519_A_32, 0, 0, 0, 0, 0, 0, 0, 0, 0
-])
-
-/* sqrt(ad - 1) with a = -1 (mod p) */
-export const ed25519_sqrtadm1 = StaticArray.fromArray<i32>([
-       24849947, -153582, -23613485, 6347715, -21072328, -667138, -25271143, -15367704, -870347, 14525639
-])
-
-/* 1 / sqrt(a - d) */
-export const ed25519_invsqrtamd = StaticArray.fromArray<i32>([
-       6111485, 4156064, -27798727, 12243468, -25904040, 120897, 20826367, -7060776, 6093568, -1986012
-])
-
-/* 1 - d ^ 2 */
-export const ed25519_onemsqd = StaticArray.fromArray<i32>([
-       6275446, -16617371, -22938544, -3773710, 11667077, 7397348, -27922721, 1766195, -24433858, 672203
-])
-
-/* (d - 1) ^ 2 */
-export const ed25519_sqdmone = StaticArray.fromArray<i32>([
-       15551795, -11097455, -13425098, -10125071, -11896535, 10178284, -26634327, 4729244, -5282110, -10116402
-])
diff --git a/assembly/fe.ts b/assembly/fe.ts

index f2170b0410dfe066e8f826d3695d774936b1f3de..e12d4b4943be007cecebcafb2d06bb8efe639679 100644 (file)
--- a/assembly/fe.ts
+++ b/assembly/fe.ts
@@ -20,8 +20,10 @@ import { load_3, load_4, sodium_is_zero } from "./utils"
   * 10x 25- and 26-bit array of values representing a large integer `n mod p`.
   */
  export type FieldElement = StaticArray<i32>
-export function fe (source: Array<i32> = new Array<i32>(10)): FieldElement {
-       return StaticArray.fromArray<i32>(source)
+export function fe (src: Array<i32> = new Array<i32>(12)): FieldElement {
+       const dst = new StaticArray<i32>(12)
+       memory.copy(changetype<usize>(dst), src.dataStart, src.length << 2)
+       return dst
  }
  
  /**
@@ -32,7 +34,7 @@ export function fe (source: Array<i32> = new Array<i32>(10)): FieldElement {
  //@ts-expect-error
  @inline
  export function fe_0 (h: FieldElement): void {
-       memory.fill(changetype<usize>(h), 0, 40)
+       memory.fill(changetype<usize>(h), 0, 48)
  }
  
  /**
@@ -44,7 +46,7 @@ export function fe_0 (h: FieldElement): void {
  @inline
  export function fe_1 (h: FieldElement): void {
         fe_0(h)
-       store<i32>(changetype<usize>(h), 1)
+       store<i32>(changetype<usize>(h), 1, 0)
  }
  
  /**
@@ -57,21 +59,12 @@ export function fe_1 (h: FieldElement): void {
  //@ts-expect-error
  @inline
  export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void {
-       const o = changetype<usize>(h)
-       const a = changetype<usize>(f)
-       const b = changetype<usize>(g)
-
-       let ai = v128.load(a)
-       let bi = v128.load(b)
-       v128.store(o, v128.add<i32>(ai, bi))
-
-       ai = v128.load(a, 16)
-       bi = v128.load(b, 16)
-       v128.store(o, v128.add<i32>(ai, bi), 16)
-
-       ai = i64x2(load<i64>(a + 32), 0)
-       bi = i64x2(load<i64>(b + 32), 0)
-       v128.store_lane<i64>(o, v128.add<i32>(ai, bi), 0, 32)
+       const h_ptr = changetype<usize>(h)
+       const f_ptr = changetype<usize>(f)
+       const g_ptr = changetype<usize>(g)
+       v128.store(h_ptr, v128.add<i32>(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0)
+       v128.store(h_ptr, v128.add<i32>(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16)
+       v128.store_lane<i64>(h_ptr, v128.add<i32>(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32)
  }
  
  /**
@@ -84,22 +77,12 @@ export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void
  //@ts-expect-error
  @inline
  export function fe_cmov (f: FieldElement, g: FieldElement, b: u64): void {
-       const p = changetype<usize>(f)
-       const q = changetype<usize>(g)
-
+       const f_ptr = changetype<usize>(f)
+       const g_ptr = changetype<usize>(g)
         const c = v128.splat<u64>(0 - b)
-
-       let pi = v128.load(p)
-       let qi = v128.load(q)
-       v128.store(p, v128.bitselect(qi, pi, c))
-
-       pi = v128.load(p, 16)
-       qi = v128.load(q, 16)
-       v128.store(p, v128.bitselect(qi, pi, c), 16)
-
-       pi = i64x2(load<i64>(p + 32), 0)
-       qi = i64x2(load<i64>(q + 32), 0)
-       v128.store_lane<u64>(p, v128.bitselect(qi, pi, c), 0, 32)
+       v128.store(f_ptr, v128.bitselect(v128.load(g_ptr), v128.load(f_ptr, 0), c))
+       v128.store(f_ptr, v128.bitselect(v128.load(g_ptr, 16), v128.load(f_ptr, 16), c), 16)
+       v128.store_lane<u64>(f_ptr, v128.bitselect(v128.load(g_ptr, 32), v128.load(f_ptr, 32), c), 0, 32)
  }
  
  /**
@@ -124,25 +107,23 @@ export function fe_copy (h: FieldElement, f: FieldElement): void {
  //@ts-expect-error
  @inline
  export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void {
-       const p = changetype<usize>(f)
-       const q = changetype<usize>(g)
-
+       const f_ptr = changetype<usize>(f)
+       const g_ptr = changetype<usize>(g)
         const c = v128.splat<u64>(0 - b)
  
-       let pi = v128.load(p)
-       let qi = v128.load(q)
-       v128.store(p, v128.bitselect(qi, pi, c))
-       v128.store(q, v128.bitselect(pi, qi, c))
-
-       pi = v128.load(p, 16)
-       qi = v128.load(q, 16)
-       v128.store(p, v128.bitselect(qi, pi, c), 16)
-       v128.store(q, v128.bitselect(pi, qi, c), 16)
-
-       pi = i64x2(load<i64>(p + 32), 0)
-       qi = i64x2(load<i64>(q + 32), 0)
-       v128.store_lane<i64>(p, v128.bitselect(qi, pi, c), 0, 32)
-       v128.store_lane<i64>(q, v128.bitselect(pi, qi, c), 0, 32)
+       const f03 = v128.load(f_ptr)
+       const g03 = v128.load(g_ptr)
+       const f47 = v128.load(f_ptr, 16)
+       const g47 = v128.load(g_ptr, 16)
+       const f89 = v128.load(f_ptr, 32)
+       const g89 = v128.load(g_ptr, 32)
+
+       v128.store(f_ptr, v128.bitselect(g03, f03, c))
+       v128.store(g_ptr, v128.bitselect(f03, g03, c))
+       v128.store(f_ptr, v128.bitselect(g47, f47, c), 16)
+       v128.store(g_ptr, v128.bitselect(f47, g47, c), 16)
+       v128.store_lane<i64>(f_ptr, v128.bitselect(g89, f89, c), 0, 32)
+       v128.store_lane<i64>(g_ptr, v128.bitselect(f89, g89, c), 0, 32)
  }
  
  /**
@@ -154,17 +135,11 @@ export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void {
  //@ts-expect-error
  @inline
  export function fe_dbl (h: FieldElement, f: FieldElement): void {
-       const o = changetype<usize>(h)
-       const a = changetype<usize>(f)
-
-       let ai = v128.load(a)
-       v128.store(o, v128.add<i32>(ai, ai))
-
-       ai = v128.load(a, 16)
-       v128.store(o, v128.add<i32>(ai, ai), 16)
-
-       ai = i64x2(load<i64>(a + 32), 0)
-       v128.store_lane<i64>(o, v128.add<i32>(ai, ai), 0, 32)
+       const h_ptr = changetype<usize>(h)
+       const f_ptr = changetype<usize>(f)
+       v128.store(h_ptr, v128.shl<i32>(v128.load(f_ptr, 0), 1), 0)
+       v128.store(h_ptr, v128.shl<i32>(v128.load(f_ptr, 16), 1), 16)
+       v128.store_lane<i64>(h_ptr, v128.shl<i32>(v128.load(f_ptr, 32), 1), 0, 32)
  }
  
  /**
@@ -195,46 +170,47 @@ export function fe_frombytes (h: FieldElement, s: StaticArray<u8>): void {
  
         carry9 = (h9 + i64(1 << 24)) >> 25
         h0 += carry9 * 19
-       h9 -= carry9 * u64(1 << 25)
+       h9 -= carry9 << 25
         carry1 = (h1 + i64(1 << 24)) >> 25
         h2 += carry1
-       h1 -= carry1 * u64(1 << 25)
+       h1 -= carry1 << 25
         carry3 = (h3 + i64(1 << 24)) >> 25
         h4 += carry3
-       h3 -= carry3 * u64(1 << 25)
+       h3 -= carry3 << 25
         carry5 = (h5 + i64(1 << 24)) >> 25
         h6 += carry5
-       h5 -= carry5 * u64(1 << 25)
+       h5 -= carry5 << 25
         carry7 = (h7 + i64(1 << 24)) >> 25
         h8 += carry7
-       h7 -= carry7 * u64(1 << 25)
+       h7 -= carry7 << 25
  
         carry0 = (h0 + i64(1 << 25)) >> 26
         h1 += carry0
-       h0 -= carry0 * u64(1 << 26)
+       h0 -= carry0 << 26
         carry2 = (h2 + i64(1 << 25)) >> 26
         h3 += carry2
-       h2 -= carry2 * u64(1 << 26)
+       h2 -= carry2 << 26
         carry4 = (h4 + i64(1 << 25)) >> 26
         h5 += carry4
-       h4 -= carry4 * u64(1 << 26)
+       h4 -= carry4 << 26
         carry6 = (h6 + i64(1 << 25)) >> 26
         h7 += carry6
-       h6 -= carry6 * u64(1 << 26)
+       h6 -= carry6 << 26
         carry8 = (h8 + i64(1 << 25)) >> 26
         h9 += carry8
-       h8 -= carry8 * u64(1 << 26)
-
-       h[0] = i32(h0)
-       h[1] = i32(h1)
-       h[2] = i32(h2)
-       h[3] = i32(h3)
-       h[4] = i32(h4)
-       h[5] = i32(h5)
-       h[6] = i32(h6)
-       h[7] = i32(h7)
-       h[8] = i32(h8)
-       h[9] = i32(h9)
+       h8 -= carry8 << 26
+
+       const h_ptr: usize = changetype<usize>(h)
+       store<i32>(h_ptr, h0, 0)
+       store<i32>(h_ptr, h1, 4)
+       store<i32>(h_ptr, h2, 8)
+       store<i32>(h_ptr, h3, 12)
+       store<i32>(h_ptr, h4, 16)
+       store<i32>(h_ptr, h5, 20)
+       store<i32>(h_ptr, h6, 24)
+       store<i32>(h_ptr, h7, 28)
+       store<i32>(h_ptr, h8, 32)
+       store<i32>(h_ptr, h9, 36)
  }
  
  const t0: FieldElement = fe()
@@ -291,7 +267,7 @@ export function fe_invert (out: FieldElement, z: FieldElement): void {
         fe_mul(out, t1, t0)
  }
  
-const fe_isnegative_s = new StaticArray<u8>(32)
+const fe_isnegative_t = fe()
  /**
   * return 1 if f is in {1,3,5,...,q-2}
   * return 0 if f is in {0,2,4,...,q-1}
@@ -302,9 +278,9 @@ const fe_isnegative_s = new StaticArray<u8>(32)
  //@ts-expect-error
  @inline
  export function fe_isnegative (f: FieldElement): u8 {
-       const s = fe_isnegative_s
-       fe_tobytes(s, f)
-       return s[0] & 1
+       const t = fe_isnegative_t
+       fe_reduce(t, f)
+       return load<u8>(changetype<usize>(t), 0) & 1
  }
  
  const fe_iszero_s = new StaticArray<u8>(32)
@@ -323,7 +299,6 @@ export function fe_iszero (f: FieldElement): u8 {
         return sodium_is_zero(s, 32)
  }
  
-const multiply_t = new StaticArray<i64>(24)
  /**
   * Multiply two FieldElements and store the product.
   *
@@ -332,60 +307,256 @@ const multiply_t = new StaticArray<i64>(24)
   * @param g FieldElement multiplicand source
   */
  export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void {
-       const o = changetype<usize>(h)
-       const a = changetype<usize>(f)
-       const b = changetype<usize>(g)
-
-       const t = changetype<usize>(multiply_t)
-       memory.fill(t, 0, 160)
-
-       const b0: v128 = v128.load(b)
-       const b4: v128 = v128.load(b + 16)
-       const b8: v128 = i32x4(load<i32>(b + 32), load<i32>(b + 36), 0, 0)
-
-       for (let i = 0; i < 10; i++) {
-               const odd = (i & 1) + 1
-               const ai = v128.splat<i32>(load<i32>(a + (i << 2)))
-
-               let ptr = t + (usize(i) << 3)
-               let pLo = i64x2.extmul_low_i32x4_s(ai, b0)
-               let pHi = i64x2.extmul_high_i32x4_s(ai, b0)
-               pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-               pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-               let tLo = v128.load(ptr)
-               let tHi = v128.load(ptr + 16)
-               tLo = i64x2.add(tLo, pLo)
-               tHi = i64x2.add(tHi, pHi)
-               v128.store(ptr, tLo)
-               v128.store(ptr + 16, tHi)
-
-               ptr += 32
-               pLo = i64x2.extmul_low_i32x4_s(ai, b4)
-               pHi = i64x2.extmul_high_i32x4_s(ai, b4)
-               pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-               pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-               tLo = v128.load(ptr)
-               tHi = v128.load(ptr + 16)
-               tLo = i64x2.add(tLo, pLo)
-               tHi = i64x2.add(tHi, pHi)
-               v128.store(ptr, tLo)
-               v128.store(ptr + 16, tHi)
-
-               ptr += 32
-               pLo = i64x2.extmul_low_i32x4_s(ai, b8)
-               pHi = i64x2.extmul_high_i32x4_s(ai, b8)
-               pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-               pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-               tLo = v128.load(ptr)
-               tHi = v128.load(ptr + 16)
-               tLo = i64x2.add(tLo, pLo)
-               tHi = i64x2.add(tHi, pHi)
-               v128.store(ptr, tLo)
-               v128.store(ptr + 16, tHi)
-       }
-       // discard last 4 elements since we do not actually need them
-       memory.fill(t + 160, 0, 32)
-       normalize(o, t)
+       const f_ptr: usize = changetype<usize>(f)
+       const g_ptr: usize = changetype<usize>(g)
+       const g0123: v128 = v128.load(g_ptr, 0)
+       const g4567: v128 = v128.load(g_ptr, 16)
+       const g89xx: v128 = v128.load(g_ptr, 32)
+
+       // mask to select even lane from first vector and odd lane from second
+       const m: v128 = i32x4(-1, 0, -1, 0)
+
+       // mask to double odd-on-odd indexes
+       const odds: v128 = i32x4(0, -1, 0, -1)
+
+       // factor of 19 for wrap from h9 to h0
+       const v19: v128 = i32x4.splat(19)
+       const g0123_19: v128 = i32x4.mul(g0123, v19)
+       const g4567_19: v128 = i32x4.mul(g4567, v19)
+       const g89xx_19: v128 = i32x4.mul(g89xx, v19)
+
+       // f[0]
+       let fi: i32 = load<i32>(f_ptr, 0)
+       let fv: v128 = i32x4.splat(fi)
+
+       let h01: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       let h23: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       let h45: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+       let h67: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+       let h89: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx)
+
+       // f[1]
+       fi = load<i32>(f_ptr, 4)
+       fv = i32x4.splat(fi)
+       fv = i32x4.add(fv, v128.and(fv, odds))
+
+       let h12: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       let h34: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       let h56: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+       let h78: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+       let h90: v128 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g89xx, g89xx_19, m))
+
+       // f[2]
+       fi = load<i32>(f_ptr, 8)
+       fv = i32x4.splat(fi)
+
+       let t0: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       let t1: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       let t2: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+       let t3: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+       let t4: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h23 = i64x2.add(h23, t0)
+       h45 = i64x2.add(h45, t1)
+       h67 = i64x2.add(h67, t2)
+       h89 = i64x2.add(h89, t3)
+       h01 = i64x2.add(h01, t4)
+
+       // f[3]
+       fi = load<i32>(f_ptr, 12)
+       fv = i32x4.splat(fi)
+       fv = i32x4.add(fv, v128.and(fv, odds))
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567)
+       t3 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m))
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h34 = i64x2.add(h34, t0)
+       h56 = i64x2.add(h56, t1)
+       h78 = i64x2.add(h78, t2)
+       h90 = i64x2.add(h90, t3)
+       h12 = i64x2.add(h12, t4)
+
+       // f[4]
+       fi = load<i32>(f_ptr, 16)
+       fv = i32x4.splat(fi)
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567)
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h45 = i64x2.add(h45, t0)
+       h67 = i64x2.add(h67, t1)
+       h89 = i64x2.add(h89, t2)
+       h01 = i64x2.add(h01, t3)
+       h23 = i64x2.add(h23, t4)
+
+       // f[5]
+       fi = load<i32>(f_ptr, 20)
+       fv = i32x4.splat(fi)
+       fv = i32x4.add(fv, v128.and(fv, odds))
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       t2 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m))
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h56 = i64x2.add(h56, t0)
+       h78 = i64x2.add(h78, t1)
+       h90 = i64x2.add(h90, t2)
+       h12 = i64x2.add(h12, t3)
+       h34 = i64x2.add(h34, t4)
+
+       // f[6]
+       fi = load<i32>(f_ptr, 24)
+       fv = i32x4.splat(fi)
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h67 = i64x2.add(h67, t0)
+       h89 = i64x2.add(h89, t1)
+       h01 = i64x2.add(h01, t2)
+       h23 = i64x2.add(h23, t3)
+       h45 = i64x2.add(h45, t4)
+
+       // f[7]
+       fi = load<i32>(f_ptr, 28)
+       fv = i32x4.splat(fi)
+       fv = i32x4.add(fv, v128.and(fv, odds))
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m))
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h78 = i64x2.add(h78, t0)
+       h90 = i64x2.add(h90, t1)
+       h12 = i64x2.add(h12, t2)
+       h34 = i64x2.add(h34, t3)
+       h56 = i64x2.add(h56, t4)
+
+       // f[8]
+       fi = load<i32>(f_ptr, 32)
+       fv = i32x4.splat(fi)
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19)
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h89 = i64x2.add(h89, t0)
+       h01 = i64x2.add(h01, t1)
+       h23 = i64x2.add(h23, t2)
+       h45 = i64x2.add(h45, t3)
+       h67 = i64x2.add(h67, t4)
+
+       // f[9]
+       fi = load<i32>(f_ptr, 36)
+       fv = i32x4.splat(fi)
+       fv = i32x4.add(fv, v128.and(fv, odds))
+
+       t0 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m))
+       t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19)
+       t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+       t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+       t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+       h90 = i64x2.add(h90, t0)
+       h12 = i64x2.add(h12, t1)
+       h34 = i64x2.add(h34, t2)
+       h56 = i64x2.add(h56, t3)
+       h78 = i64x2.add(h78, t4)
+
+       // extract scalars from vectors
+       let h0: i64 = i64x2.extract_lane(h90, 1) + i64x2.extract_lane(h01, 0)
+       let h1: i64 = i64x2.extract_lane(h01, 1) + i64x2.extract_lane(h12, 0)
+       let h2: i64 = i64x2.extract_lane(h12, 1) + i64x2.extract_lane(h23, 0)
+       let h3: i64 = i64x2.extract_lane(h23, 1) + i64x2.extract_lane(h34, 0)
+       let h4: i64 = i64x2.extract_lane(h34, 1) + i64x2.extract_lane(h45, 0)
+       let h5: i64 = i64x2.extract_lane(h45, 1) + i64x2.extract_lane(h56, 0)
+       let h6: i64 = i64x2.extract_lane(h56, 1) + i64x2.extract_lane(h67, 0)
+       let h7: i64 = i64x2.extract_lane(h67, 1) + i64x2.extract_lane(h78, 0)
+       let h8: i64 = i64x2.extract_lane(h78, 1) + i64x2.extract_lane(h89, 0)
+       let h9: i64 = i64x2.extract_lane(h89, 1) + i64x2.extract_lane(h90, 0)
+
+       // extract scalars from vectors during first carry
+       let carry0: i64
+       let carry1: i64
+       let carry2: i64
+       let carry3: i64
+       let carry4: i64
+       let carry5: i64
+       let carry6: i64
+       let carry7: i64
+       let carry8: i64
+       let carry9: i64
+
+       carry0 = (h0 + (1 << 25)) >> 26
+       h1 += carry0
+       h0 -= carry0 << 26
+       carry4 = (h4 + (1 << 25)) >> 26
+       h5 += carry4
+       h4 -= carry4 << 26
+
+       carry1 = (h1 + (1 << 24)) >> 25
+       h2 += carry1
+       h1 -= carry1 << 25
+       carry5 = (h5 + (1 << 24)) >> 25
+       h6 += carry5
+       h5 -= carry5 << 25
+
+       carry2 = (h2 + (1 << 25)) >> 26
+       h3 += carry2
+       h2 -= carry2 << 26
+       carry6 = (h6 + (1 << 25)) >> 26
+       h7 += carry6
+       h6 -= carry6 << 26
+
+       carry3 = (h3 + (1 << 24)) >> 25
+       h4 += carry3
+       h3 -= carry3 << 25
+       carry7 = (h7 + (1 << 24)) >> 25
+       h8 += carry7
+       h7 -= carry7 << 25
+
+       carry4 = (h4 + (1 << 25)) >> 26
+       h5 += carry4
+       h4 -= carry4 << 26
+       carry8 = (h8 + (1 << 25)) >> 26
+       h9 += carry8
+       h8 -= carry8 << 26
+
+       carry9 = (h9 + (1 << 24)) >> 25
+       h0 += carry9 * 19
+       h9 -= carry9 << 25
+
+       carry0 = (h0 + (1 << 25)) >> 26
+       h1 += carry0
+       h0 -= carry0 << 26
+
+       // assign results to output
+       const h_ptr: usize = changetype<usize>(h)
+       store<i32>(h_ptr, h0, 0)
+       store<i32>(h_ptr, h1, 4)
+       store<i32>(h_ptr, h2, 8)
+       store<i32>(h_ptr, h3, 12)
+       store<i32>(h_ptr, h4, 16)
+       store<i32>(h_ptr, h5, 20)
+       store<i32>(h_ptr, h6, 24)
+       store<i32>(h_ptr, h7, 28)
+       store<i32>(h_ptr, h8, 32)
+       store<i32>(h_ptr, h9, 36)
  }
  
  /**
@@ -397,17 +568,11 @@ export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void
  //@ts-expect-error
  @inline
  export function fe_neg (h: FieldElement, f: FieldElement): void {
-       const o = changetype<usize>(h)
-       const a = changetype<usize>(f)
-
-       let ai = v128.load(a)
-       v128.store(o, v128.neg<i32>(ai))
-
-       ai = v128.load(a, 16)
-       v128.store(o, v128.neg<i32>(ai), 16)
-
-       ai = i64x2(load<i64>(a + 32), 0)
-       v128.store_lane<i64>(o, v128.neg<i32>(ai), 0, 32)
+       const h_ptr = changetype<usize>(h)
+       const f_ptr = changetype<usize>(f)
+       v128.store(h_ptr, v128.neg<i32>(v128.load(f_ptr)))
+       v128.store(h_ptr, v128.neg<i32>(v128.load(f_ptr, 16)), 16)
+       v128.store_lane<i64>(h_ptr, v128.neg<i32>(v128.load(f_ptr, 32)), 0, 32)
  }
  
  const fe_pow22523_t0: FieldElement = fe()
@@ -492,100 +657,752 @@ export function fe_pow22523 (out: FieldElement, z: FieldElement): void {
   * so floor(2⁻²⁵⁵(h + 19 2^(-25) h9 + 2^(-1))) = q.
   */
  export function fe_reduce (h: FieldElement, f: FieldElement): void {
-       let h0 = f[0]
-       let h1 = f[1]
-       let h2 = f[2]
-       let h3 = f[3]
-       let h4 = f[4]
-       let h5 = f[5]
-       let h6 = f[6]
-       let h7 = f[7]
-       let h8 = f[8]
-       let h9 = f[9]
+       const f_ptr: usize = changetype<usize>(f)
+       let f0: i32 = load<i32>(f_ptr, 0)
+       let f1: i32 = load<i32>(f_ptr, 4)
+       let f2: i32 = load<i32>(f_ptr, 8)
+       let f3: i32 = load<i32>(f_ptr, 12)
+       let f4: i32 = load<i32>(f_ptr, 16)
+       let f5: i32 = load<i32>(f_ptr, 20)
+       let f6: i32 = load<i32>(f_ptr, 24)
+       let f7: i32 = load<i32>(f_ptr, 28)
+       let f8: i32 = load<i32>(f_ptr, 32)
+       let f9: i32 = load<i32>(f_ptr, 36)
         let q: i32
-       let carry0: i32, carry1: i32, carry2: i32, carry3: i32, carry4: i32, carry5: i32, carry6: i32, carry7: i32, carry8: i32, carry9: i32
-
-       q = (19 * h9 + (u32(1) << 24)) >> 25
-       q = (h0 + q) >> 26
-       q = (h1 + q) >> 25
-       q = (h2 + q) >> 26
-       q = (h3 + q) >> 25
-       q = (h4 + q) >> 26
-       q = (h5 + q) >> 25
-       q = (h6 + q) >> 26
-       q = (h7 + q) >> 25
-       q = (h8 + q) >> 26
-       q = (h9 + q) >> 25
+       let c0: i32, c1: i32, c2: i32, c3: i32, c4: i32, c5: i32, c6: i32, c7: i32, c8: i32, c9: i32
+
+       q = (19 * f9 + (1 << 24)) >> 25
+       q = (f0 + q) >> 26
+       q = (f1 + q) >> 25
+       q = (f2 + q) >> 26
+       q = (f3 + q) >> 25
+       q = (f4 + q) >> 26
+       q = (f5 + q) >> 25
+       q = (f6 + q) >> 26
+       q = (f7 + q) >> 25
+       q = (f8 + q) >> 26
+       q = (f9 + q) >> 25
  
         /* Goal: Output h-(2²⁵⁵-19)q, which is between 0 and 2²⁵⁵-20. */
-       h0 += 19 * q
+       f0 += 19 * q
         /* Goal: Output h-2²⁵⁵ q, which is between 0 and 2²⁵⁵-20. */
  
-       carry0 = h0 >> 26
+       c0 = f0 >> 26
+       f1 += c0
+       f0 -= c0 << 26
+       c1 = f1 >> 25
+       f2 += c1
+       f1 -= c1 << 25
+       c2 = f2 >> 26
+       f3 += c2
+       f2 -= c2 << 26
+       c3 = f3 >> 25
+       f4 += c3
+       f3 -= c3 << 25
+       c4 = f4 >> 26
+       f5 += c4
+       f4 -= c4 << 26
+       c5 = f5 >> 25
+       f6 += c5
+       f5 -= c5 << 25
+       c6 = f6 >> 26
+       f7 += c6
+       f6 -= c6 << 26
+       c7 = f7 >> 25
+       f8 += c7
+       f7 -= c7 << 25
+       c8 = f8 >> 26
+       f9 += c8
+       f8 -= c8 << 26
+       c9 = f9 >> 25
+       f9 -= c9 << 25
+
+       const h_ptr: usize = changetype<usize>(h)
+       store<i32>(h_ptr, f0, 0)
+       store<i32>(h_ptr, f1, 4)
+       store<i32>(h_ptr, f2, 8)
+       store<i32>(h_ptr, f3, 12)
+       store<i32>(h_ptr, f4, 16)
+       store<i32>(h_ptr, f5, 20)
+       store<i32>(h_ptr, f6, 24)
+       store<i32>(h_ptr, f7, 28)
+       store<i32>(h_ptr, f8, 32)
+       store<i32>(h_ptr, f9, 36)
+}
+
+/**
+ * Square a FieldElement and store the result.
+ *
+ * Some iterations of the inner loop can be skipped since
+ * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]`
+ *
+ * ```
+ * // non-constant-time example
+ * for (let i = 0; i < 10; i++) {
+ *     for (let j = i; j < 10; j++) {
+ *             h[i + j] += f[i] * g[j]
+ *             if (i < j) {
+ *                     h[i + j] *= 2
+ *             }
+ *     }
+ * }
+ * ```
+ *
+ * h = f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
+ * @param h FieldElement power destination
+ * @param f FieldElement base source
+ */
+export function fe_sq (h: FieldElement, f: FieldElement): void {
+       const f_ptr: usize = changetype<usize>(f)
+       const f0: i64 = i64(load<i32>(f_ptr, 0))
+       const f1: i64 = i64(load<i32>(f_ptr, 4))
+       const f2: i64 = i64(load<i32>(f_ptr, 8))
+       const f3: i64 = i64(load<i32>(f_ptr, 12))
+       const f4: i64 = i64(load<i32>(f_ptr, 16))
+       const f5: i64 = i64(load<i32>(f_ptr, 20))
+       const f6: i64 = i64(load<i32>(f_ptr, 24))
+       const f7: i64 = i64(load<i32>(f_ptr, 28))
+       const f8: i64 = i64(load<i32>(f_ptr, 32))
+       const f9: i64 = i64(load<i32>(f_ptr, 36))
+
+       const f0_2: i64 = f0 * 2
+       const f1_2: i64 = f1 * 2
+       const f2_2: i64 = f2 * 2
+       const f3_2: i64 = f3 * 2
+       const f4_2: i64 = f4 * 2
+       const f5_2: i64 = f5 * 2
+       const f6_2: i64 = f6 * 2
+       const f7_2: i64 = f7 * 2
+
+       const f5_19: i64 = f5 * 19 /* 1.959375*2^29 */
+       const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */
+       const f7_19: i64 = f7 * 19 /* 1.959375*2^29 */
+       const f8_19: i64 = f8 * 19 /* 1.959375*2^30 */
+       const f9_19: i64 = f9 * 19 /* 1.959375*2^29 */
+
+       const f7_38: i64 = f7 * 38 /* 1.959375*2^30 */
+       const f9_38: i64 = f9 * 38 /* 1.959375*2^30 */
+
+       let h0: i64 = f0 * f0
+       let h1: i64 = f0_2 * f1
+       let h2: i64 = f0_2 * f2
+       let h3: i64 = f0_2 * f3
+       let h4: i64 = f0_2 * f4
+       let h5: i64 = f0_2 * f5
+       let h6: i64 = f0_2 * f6
+       let h7: i64 = f0_2 * f7
+       let h8: i64 = f0_2 * f8
+       let h9: i64 = f0_2 * f9
+
+       h2 += f1_2 * f1
+       h3 += f1_2 * f2
+       h4 += f1_2 * f3_2
+       h5 += f1_2 * f4
+       h6 += f1_2 * f5_2
+       h7 += f1_2 * f6
+       h8 += f1_2 * f7_2
+       h9 += f1_2 * f8
+       h0 += f1_2 * f9_38
+
+       h4 += f2 * f2
+       h5 += f2_2 * f3
+       h6 += f2_2 * f4
+       h7 += f2_2 * f5
+       h8 += f2_2 * f6
+       h9 += f2_2 * f7
+       h0 += f2_2 * f8_19
+       h1 += f2_2 * f9_19
+
+       h6 += f3_2 * f3
+       h7 += f3_2 * f4
+       h8 += f3_2 * f5_2
+       h9 += f3_2 * f6
+       h0 += f3_2 * f7_38
+       h1 += f3_2 * f8_19
+       h2 += f3_2 * f9_38
+
+       h8 += f4 * f4
+       h9 += f4_2 * f5
+       h0 += f4_2 * f6_19
+       h1 += f4_2 * f7_19
+       h2 += f4_2 * f8_19
+       h3 += f4_2 * f9_19
+
+       h0 += f5_2 * f5_19
+       h1 += f5_2 * f6_19
+       h2 += f5_2 * f7_38
+       h3 += f5_2 * f8_19
+       h4 += f5_2 * f9_38
+
+       h2 += f6 * f6_19
+       h3 += f6_2 * f7_19
+       h4 += f6_2 * f8_19
+       h5 += f6_2 * f9_19
+
+       h4 += f7_2 * f7_19
+       h5 += f7_2 * f8_19
+       h6 += f7_2 * f9_38
+
+       h6 += f8 * f8_19
+       h7 += f8 * f9_38
+
+       h8 += f9 * f9_38
+
+       let carry0: i64
+       let carry1: i64
+       let carry2: i64
+       let carry3: i64
+       let carry4: i64
+       let carry5: i64
+       let carry6: i64
+       let carry7: i64
+       let carry8: i64
+       let carry9: i64
+
+       carry0 = (h0 + i64(1 << 25)) >> 26
         h1 += carry0
-       h0 -= carry0 * u32(1 << 26)
-       carry1 = h1 >> 25
-       h2 += carry1
-       h1 -= carry1 * u32(1 << 25)
-       carry2 = h2 >> 26
-       h3 += carry2
-       h2 -= carry2 * u32(1 << 26)
-       carry3 = h3 >> 25
-       h4 += carry3
-       h3 -= carry3 * u32(1 << 25)
-       carry4 = h4 >> 26
+       h0 -= carry0 << 26
+       carry4 = (h4 + i64(1 << 25)) >> 26
         h5 += carry4
-       h4 -= carry4 * u32(1 << 26)
-       carry5 = h5 >> 25
+       h4 -= carry4 << 26
+
+       carry1 = (h1 + i64(1 << 24)) >> 25
+       h2 += carry1
+       h1 -= carry1 << 25
+       carry5 = (h5 + i64(1 << 24)) >> 25
         h6 += carry5
-       h5 -= carry5 * u32(1 << 25)
-       carry6 = h6 >> 26
+       h5 -= carry5 << 25
+
+       carry2 = (h2 + i64(1 << 25)) >> 26
+       h3 += carry2
+       h2 -= carry2 << 26
+       carry6 = (h6 + i64(1 << 25)) >> 26
         h7 += carry6
-       h6 -= carry6 * u32(1 << 26)
-       carry7 = h7 >> 25
+       h6 -= carry6 << 26
+
+       carry3 = (h3 + i64(1 << 24)) >> 25
+       h4 += carry3
+       h3 -= carry3 << 25
+       carry7 = (h7 + i64(1 << 24)) >> 25
         h8 += carry7
-       h7 -= carry7 * u32(1 << 25)
-       carry8 = h8 >> 26
+       h7 -= carry7 << 25
+
+       carry4 = (h4 + i64(1 << 25)) >> 26
+       h5 += carry4
+       h4 -= carry4 << 26
+       carry8 = (h8 + i64(1 << 25)) >> 26
         h9 += carry8
-       h8 -= carry8 * u32(1 << 26)
-       carry9 = h9 >> 25
-       h9 -= carry9 * u32(1 << 25)
-
-       h[0] = h0
-       h[1] = h1
-       h[2] = h2
-       h[3] = h3
-       h[4] = h4
-       h[5] = h5
-       h[6] = h6
-       h[7] = h7
-       h[8] = h8
-       h[9] = h9
+       h8 -= carry8 << 26
+
+       carry9 = (h9 + i64(1 << 24)) >> 25
+       h0 += carry9 * 19
+       h9 -= carry9 << 25
+
+       carry0 = (h0 + i64(1 << 25)) >> 26
+       h1 += carry0
+       h0 -= carry0 << 26
+
+       const h_ptr: usize = changetype<usize>(h)
+       store<i32>(h_ptr, h0, 0)
+       store<i32>(h_ptr, h1, 4)
+       store<i32>(h_ptr, h2, 8)
+       store<i32>(h_ptr, h3, 12)
+       store<i32>(h_ptr, h4, 16)
+       store<i32>(h_ptr, h5, 20)
+       store<i32>(h_ptr, h6, 24)
+       store<i32>(h_ptr, h7, 28)
+       store<i32>(h_ptr, h8, 32)
+       store<i32>(h_ptr, h9, 36)
  }
  
  /**
   * Square a FieldElement and store the result.
   *
- * @param h FieldElement power destination
- * @param f FieldElement base source
+ * Some iterations of the inner loop can be skipped since
+ * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]`
+ *
+ * ```
+ * // non-constant-time example
+ * for (let i = 0; i < 10; i++) {
+ *     for (let j = i; j < 10; j++) {
+ *             h[i + j] += f[i] * g[j]
+ *             if (i < j) {
+ *                     h[i + j] *= 2
+ *             }
+ *     }
+ * }
+ * ```
+ *
+ * h = f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
+ * @param hx FieldElement power destination
+ * @param fx FieldElement base source
   */
-//@ts-expect-error
-@inline
-export function fe_sq (h: FieldElement, f: FieldElement): void {
-       fe_mul(h, f, f)
+export function fe_sq_vec (hx: FieldElement, fx: FieldElement, hy: FieldElement, fy: FieldElement): void {
+       const fx_ptr: usize = changetype<usize>(fx)
+       const fy_ptr: usize = changetype<usize>(fy)
+       const f0: v128 = i32x4(load<i32>(fx_ptr, 0), load<i32>(fy_ptr, 0), 0, 0)
+       const f1: v128 = i32x4(load<i32>(fx_ptr, 4), load<i32>(fy_ptr, 4), 0, 0)
+       const f2: v128 = i32x4(load<i32>(fx_ptr, 8), load<i32>(fy_ptr, 8), 0, 0)
+       const f3: v128 = i32x4(load<i32>(fx_ptr, 12), load<i32>(fy_ptr, 12), 0, 0)
+       const f4: v128 = i32x4(load<i32>(fx_ptr, 16), load<i32>(fy_ptr, 16), 0, 0)
+       const f5: v128 = i32x4(load<i32>(fx_ptr, 20), load<i32>(fy_ptr, 20), 0, 0)
+       const f6: v128 = i32x4(load<i32>(fx_ptr, 24), load<i32>(fy_ptr, 24), 0, 0)
+       const f7: v128 = i32x4(load<i32>(fx_ptr, 28), load<i32>(fy_ptr, 28), 0, 0)
+       const f8: v128 = i32x4(load<i32>(fx_ptr, 32), load<i32>(fy_ptr, 32), 0, 0)
+       const f9: v128 = i32x4(load<i32>(fx_ptr, 36), load<i32>(fy_ptr, 36), 0, 0)
+
+       const v19: v128 = i32x4.splat(19)
+       const f5_19: v128 = i32x4.mul(f5, v19) /* 1.959375*2^29 */
+       const f6_19: v128 = i32x4.mul(f6, v19) /* 1.959375*2^30 */
+       const f7_19: v128 = i32x4.mul(f7, v19) /* 1.959375*2^29 */
+       const f8_19: v128 = i32x4.mul(f8, v19) /* 1.959375*2^30 */
+       const f9_19: v128 = i32x4.mul(f9, v19) /* 1.959375*2^29 */
+
+       // f[0]
+       let f_2: v128 = i32x4.shl(f0, 1)
+
+       let h0: v128 = i64x2.extmul_low_i32x4_s(f0, f0)
+       let h1: v128 = i64x2.extmul_low_i32x4_s(f_2, f1)
+       let h2: v128 = i64x2.extmul_low_i32x4_s(f_2, f2)
+       let h3: v128 = i64x2.extmul_low_i32x4_s(f_2, f3)
+       let h4: v128 = i64x2.extmul_low_i32x4_s(f_2, f4)
+       let h5: v128 = i64x2.extmul_low_i32x4_s(f_2, f5)
+       let h6: v128 = i64x2.extmul_low_i32x4_s(f_2, f6)
+       let h7: v128 = i64x2.extmul_low_i32x4_s(f_2, f7)
+       let h8: v128 = i64x2.extmul_low_i32x4_s(f_2, f8)
+       let h9: v128 = i64x2.extmul_low_i32x4_s(f_2, f9)
+
+       // f[1]
+       f_2 = i32x4.shl(f1, 1)
+       let f_4: v128 = i32x4.shl(f1, 2)
+
+       let t0: v128 = i64x2.extmul_low_i32x4_s(f_2, f1)
+       let t1: v128 = i64x2.extmul_low_i32x4_s(f_2, f2)
+       let t2: v128 = i64x2.extmul_low_i32x4_s(f_4, f3)
+       let t3: v128 = i64x2.extmul_low_i32x4_s(f_2, f4)
+       let t4: v128 = i64x2.extmul_low_i32x4_s(f_4, f5)
+       let t5: v128 = i64x2.extmul_low_i32x4_s(f_2, f6)
+       let t6: v128 = i64x2.extmul_low_i32x4_s(f_4, f7)
+       let t7: v128 = i64x2.extmul_low_i32x4_s(f_2, f8)
+       let t8: v128 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+       h2 = i64x2.add(h2, t0)
+       h3 = i64x2.add(h3, t1)
+       h4 = i64x2.add(h4, t2)
+       h5 = i64x2.add(h5, t3)
+       h6 = i64x2.add(h6, t4)
+       h7 = i64x2.add(h7, t5)
+       h8 = i64x2.add(h8, t6)
+       h9 = i64x2.add(h9, t7)
+       h0 = i64x2.add(h0, t8)
+
+       // f[2]
+       f_2 = i32x4.shl(f2, 1)
+
+       t0 = i64x2.extmul_low_i32x4_s(f2, f2)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f3)
+       t2 = i64x2.extmul_low_i32x4_s(f_2, f4)
+       t3 = i64x2.extmul_low_i32x4_s(f_2, f5)
+       t4 = i64x2.extmul_low_i32x4_s(f_2, f6)
+       t5 = i64x2.extmul_low_i32x4_s(f_2, f7)
+       t6 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t7 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+       h4 = i64x2.add(h4, t0)
+       h5 = i64x2.add(h5, t1)
+       h6 = i64x2.add(h6, t2)
+       h7 = i64x2.add(h7, t3)
+       h8 = i64x2.add(h8, t4)
+       h9 = i64x2.add(h9, t5)
+       h0 = i64x2.add(h0, t6)
+       h1 = i64x2.add(h1, t7)
+
+       // f[3]
+       f_2 = i32x4.shl(f3, 1)
+       f_4 = i32x4.shl(f3, 2)
+
+       t0 = i64x2.extmul_low_i32x4_s(f_2, f3)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f4)
+       t2 = i64x2.extmul_low_i32x4_s(f_4, f5)
+       t3 = i64x2.extmul_low_i32x4_s(f_2, f6)
+       t4 = i64x2.extmul_low_i32x4_s(f_4, f7_19) /* 1.959375*2^30 */
+       t5 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t6 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+       h6 = i64x2.add(h6, t0)
+       h7 = i64x2.add(h7, t1)
+       h8 = i64x2.add(h8, t2)
+       h9 = i64x2.add(h9, t3)
+       h0 = i64x2.add(h0, t4) /* 1.959375*2^30 */
+       h1 = i64x2.add(h1, t5)
+       h2 = i64x2.add(h2, t6)
+
+       // f[4]
+       f_2 = i32x4.shl(f4, 1)
+
+       t0 = i64x2.extmul_low_i32x4_s(f4, f4)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f5)
+       t2 = i64x2.extmul_low_i32x4_s(f_2, f6_19)
+       t3 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+       t4 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t5 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+       h8 = i64x2.add(h8, t0)
+       h9 = i64x2.add(h9, t1)
+       h0 = i64x2.add(h0, t2)
+       h1 = i64x2.add(h1, t3)
+       h2 = i64x2.add(h2, t4)
+       h3 = i64x2.add(h3, t5)
+
+       // f[5]
+       f_2 = i32x4.shl(f5, 1)
+       f_4 = i32x4.shl(f5, 2)
+
+       t0 = i64x2.extmul_low_i32x4_s(f_2, f5_19)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f6_19)
+       t2 = i64x2.extmul_low_i32x4_s(f_4, f7_19)
+       t3 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t4 = i64x2.extmul_low_i32x4_s(f_4, f9_19) /* 1.959375*2^30 */
+
+       h0 = i64x2.add(h0, t0)
+       h1 = i64x2.add(h1, t1)
+       h2 = i64x2.add(h2, t2)
+       h3 = i64x2.add(h3, t3)
+       h4 = i64x2.add(h4, t4) /* 1.959375*2^30 */
+
+       // f[6]
+       f_2 = i32x4.shl(f6, 1)
+
+       t0 = i64x2.extmul_low_i32x4_s(f6, f6_19)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+       t2 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t3 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+       h2 = i64x2.add(h2, t0)
+       h3 = i64x2.add(h3, t1)
+       h4 = i64x2.add(h4, t2)
+       h5 = i64x2.add(h5, t3)
+
+       // f[7]
+       f_2 = i32x4.shl(f7, 1)
+       f_4 = i32x4.shl(f7, 2)
+
+       t0 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+       t2 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+       h4 = i64x2.add(h4, t0)
+       h5 = i64x2.add(h5, t1)
+       h6 = i64x2.add(h6, t2)
+
+       // f[8]
+       f_2 = i32x4.shl(f8, 1)
+
+       t0 = i64x2.extmul_low_i32x4_s(f8, f8_19)
+       t1 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+       h6 = i64x2.add(h6, t0)
+       h7 = i64x2.add(h7, t1)
+
+       // f[9]
+       f_2 = i32x4.shl(f9, 1)
+
+       t0 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+       h8 = i64x2.add(h8, t0)
+
+       const v24: v128 = i64x2.splat(1 << 24)
+       const v25: v128 = i64x2.splat(1 << 25)
+       let carry0: v128
+       let carry1: v128
+       let carry2: v128
+       let carry3: v128
+       let carry4: v128
+       let carry5: v128
+       let carry6: v128
+       let carry7: v128
+       let carry8: v128
+       let carry9: v128
+
+       carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26)
+       h1 = i64x2.add(h1, carry0)
+       h0 = i64x2.sub(h0, i64x2.shl(carry0, 26))
+       carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26)
+       h5 = i64x2.add(h5, carry4)
+       h4 = i64x2.sub(h4, i64x2.shl(carry4, 26))
+
+       carry1 = i64x2.shr_s(((i64x2.add(h1, v24))), 25)
+       h2 = i64x2.add(h2, carry1)
+       h1 = i64x2.sub(h1, i64x2.shl(carry1, 25))
+       carry5 = i64x2.shr_s(((i64x2.add(h5, v24))), 25)
+       h6 = i64x2.add(h6, carry5)
+       h5 = i64x2.sub(h5, i64x2.shl(carry5, 25))
+
+       carry2 = i64x2.shr_s(((i64x2.add(h2, v25))), 26)
+       h3 = i64x2.add(h3, carry2)
+       h2 = i64x2.sub(h2, i64x2.shl(carry2, 26))
+       carry6 = i64x2.shr_s(((i64x2.add(h6, v25))), 26)
+       h7 = i64x2.add(h7, carry6)
+       h6 = i64x2.sub(h6, i64x2.shl(carry6, 26))
+
+       carry3 = i64x2.shr_s(((i64x2.add(h3, v24))), 25)
+       h4 = i64x2.add(h4, carry3)
+       h3 = i64x2.sub(h3, i64x2.shl(carry3, 25))
+       carry7 = i64x2.shr_s(((i64x2.add(h7, v24))), 25)
+       h8 = i64x2.add(h8, carry7)
+       h7 = i64x2.sub(h7, i64x2.shl(carry7, 25))
+
+       carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26)
+       h5 = i64x2.add(h5, carry4)
+       h4 = i64x2.sub(h4, i64x2.shl(carry4, 26))
+       carry8 = i64x2.shr_s(((i64x2.add(h8, v25))), 26)
+       h9 = i64x2.add(h9, carry8)
+       h8 = i64x2.sub(h8, i64x2.shl(carry8, 26))
+
+       carry9 = i64x2.shr_s(((i64x2.add(h9, v24))), 25)
+       h0 = i64x2.add(h0, i64x2.mul(carry9, i64x2.splat(19)))
+       h9 = i64x2.sub(h9, i64x2.shl(carry9, 25))
+
+       carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26)
+       h1 = i64x2.add(h1, carry0)
+       h0 = i64x2.sub(h0, i64x2.shl(carry0, 26))
+
+       const hx_ptr: usize = changetype<usize>(hx)
+       store<i32>(hx_ptr, i64x2.extract_lane(h0, 0), 0)
+       store<i32>(hx_ptr, i64x2.extract_lane(h1, 0), 4)
+       store<i32>(hx_ptr, i64x2.extract_lane(h2, 0), 8)
+       store<i32>(hx_ptr, i64x2.extract_lane(h3, 0), 12)
+       store<i32>(hx_ptr, i64x2.extract_lane(h4, 0), 16)
+       store<i32>(hx_ptr, i64x2.extract_lane(h5, 0), 20)
+       store<i32>(hx_ptr, i64x2.extract_lane(h6, 0), 24)
+       store<i32>(hx_ptr, i64x2.extract_lane(h7, 0), 28)
+       store<i32>(hx_ptr, i64x2.extract_lane(h8, 0), 32)
+       store<i32>(hx_ptr, i64x2.extract_lane(h9, 0), 36)
+
+       const hy_ptr: usize = changetype<usize>(hy)
+       store<i32>(hy_ptr, i64x2.extract_lane(h0, 1), 0)
+       store<i32>(hy_ptr, i64x2.extract_lane(h1, 1), 4)
+       store<i32>(hy_ptr, i64x2.extract_lane(h2, 1), 8)
+       store<i32>(hy_ptr, i64x2.extract_lane(h3, 1), 12)
+       store<i32>(hy_ptr, i64x2.extract_lane(h4, 1), 16)
+       store<i32>(hy_ptr, i64x2.extract_lane(h5, 1), 20)
+       store<i32>(hy_ptr, i64x2.extract_lane(h6, 1), 24)
+       store<i32>(hy_ptr, i64x2.extract_lane(h7, 1), 28)
+       store<i32>(hy_ptr, i64x2.extract_lane(h8, 1), 32)
+       store<i32>(hy_ptr, i64x2.extract_lane(h9, 1), 36)
  }
  
  /**
   * Square a FieldElement, double it, and store the result.
   *
+ * h = 2 * f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
   * @param h FieldElement power destination
   * @param f FieldElement base source
   */
-//@ts-expect-error
-@inline
  export function fe_sq2 (h: FieldElement, f: FieldElement): void {
-       fe_mul(h, f, f)
-       fe_add(h, h, h)
+       const f_ptr: usize = changetype<usize>(f)
+       const f0: i64 = i64(load<i32>(f_ptr, 0))
+       const f1: i64 = i64(load<i32>(f_ptr, 4))
+       const f2: i64 = i64(load<i32>(f_ptr, 8))
+       const f3: i64 = i64(load<i32>(f_ptr, 12))
+       const f4: i64 = i64(load<i32>(f_ptr, 16))
+       const f5: i64 = i64(load<i32>(f_ptr, 20))
+       const f6: i64 = i64(load<i32>(f_ptr, 24))
+       const f7: i64 = i64(load<i32>(f_ptr, 28))
+       const f8: i64 = i64(load<i32>(f_ptr, 32))
+       const f9: i64 = i64(load<i32>(f_ptr, 36))
+
+       const f0_2: i64 = f0 * 2
+       const f1_2: i64 = f1 * 2
+       const f2_2: i64 = f2 * 2
+       const f3_2: i64 = f3 * 2
+       const f4_2: i64 = f4 * 2
+       const f5_2: i64 = f5 * 2
+       const f6_2: i64 = f6 * 2
+       const f7_2: i64 = f7 * 2
+
+       const f5_38: i64 = f5 * 38 /* 1.959375*2^30 */
+       const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */
+       const f7_38: i64 = f7 * 38/* 1.959375*2^30 */
+       const f8_19: i64 = f8 * 19/* 1.959375*2^30 */
+       const f9_38: i64 = f9 * 38/* 1.959375*2^30 */
+
+       const f0f0: i64 = f0 * f0
+       const f0f1_2: i64 = f0_2 * f1
+       const f0f2_2: i64 = f0_2 * f2
+       const f0f3_2: i64 = f0_2 * f3
+       const f0f4_2: i64 = f0_2 * f4
+       const f0f5_2: i64 = f0_2 * f5
+       const f0f6_2: i64 = f0_2 * f6
+       const f0f7_2: i64 = f0_2 * f7
+       const f0f8_2: i64 = f0_2 * f8
+       const f0f9_2: i64 = f0_2 * f9
+
+       const f1f1_2: i64 = f1_2 * f1
+       const f1f2_2: i64 = f1_2 * f2
+       const f1f3_4: i64 = f1_2 * f3_2
+       const f1f4_2: i64 = f1_2 * f4
+       const f1f5_4: i64 = f1_2 * f5_2
+       const f1f6_2: i64 = f1_2 * f6
+       const f1f7_4: i64 = f1_2 * f7_2
+       const f1f8_2: i64 = f1_2 * f8
+       const f1f9_76: i64 = f1_2 * f9_38
+
+       const f2f2: i64 = f2 * f2
+       const f2f3_2: i64 = f2_2 * f3
+       const f2f4_2: i64 = f2_2 * f4
+       const f2f5_2: i64 = f2_2 * f5
+       const f2f6_2: i64 = f2_2 * f6
+       const f2f7_2: i64 = f2_2 * f7
+       const f2f8_38: i64 = f2_2 * f8_19
+       const f2f9_38: i64 = f2 * f9_38
+
+       const f3f3_2: i64 = f3_2 * f3
+       const f3f4_2: i64 = f3_2 * f4
+       const f3f5_4: i64 = f3_2 * f5_2
+       const f3f6_2: i64 = f3_2 * f6
+       const f3f7_76: i64 = f3_2 * f7_38
+       const f3f8_38: i64 = f3_2 * f8_19
+       const f3f9_76: i64 = f3_2 * f9_38
+
+       const f4f4: i64 = f4 * f4
+       const f4f5_2: i64 = f4_2 * f5
+       const f4f6_38: i64 = f4_2 * f6_19
+       const f4f7_38: i64 = f4 * f7_38
+       const f4f8_38: i64 = f4_2 * f8_19
+       const f4f9_38: i64 = f4 * f9_38
+
+       const f5f5_38: i64 = f5 * f5_38
+       const f5f6_38: i64 = f5_2 * f6_19
+       const f5f7_76: i64 = f5_2 * f7_38
+       const f5f8_38: i64 = f5_2 * f8_19
+       const f5f9_76: i64 = f5_2 * f9_38
+
+       const f6f6_19: i64 = f6 * f6_19
+       const f6f7_38: i64 = f6 * f7_38
+       const f6f8_38: i64 = f6_2 * f8_19
+       const f6f9_38: i64 = f6 * f9_38
+
+       const f7f7_38: i64 = f7 * f7_38
+       const f7f8_38: i64 = f7_2 * f8_19
+       const f7f9_76: i64 = f7_2 * f9_38
+
+       const f8f8_19: i64 = f8 * f8_19
+       const f8f9_38: i64 = f8 * f9_38
+
+       const f9f9_38: i64 = f9 * f9_38
+
+       let h0: i64 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
+       let h1: i64 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
+       let h2: i64 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
+       let h3: i64 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
+       let h4: i64 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
+       let h5: i64 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
+       let h6: i64 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
+       let h7: i64 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
+       let h8: i64 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
+       let h9: i64 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
+
+       h0 *= 2
+       h1 *= 2
+       h2 *= 2
+       h3 *= 2
+       h4 *= 2
+       h5 *= 2
+       h6 *= 2
+       h7 *= 2
+       h8 *= 2
+       h9 *= 2
+
+       let carry0: i64
+       let carry1: i64
+       let carry2: i64
+       let carry3: i64
+       let carry4: i64
+       let carry5: i64
+       let carry6: i64
+       let carry7: i64
+       let carry8: i64
+       let carry9: i64
+
+       carry0 = (h0 + i64(1 << 25)) >> 26
+       h1 += carry0
+       h0 -= carry0 << 26
+       carry4 = (h4 + i64(1 << 25)) >> 26
+       h5 += carry4
+       h4 -= carry4 << 26
+
+       carry1 = (h1 + i64(1 << 24)) >> 25
+       h2 += carry1
+       h1 -= carry1 << 25
+       carry5 = (h5 + i64(1 << 24)) >> 25
+       h6 += carry5
+       h5 -= carry5 << 25
+
+       carry2 = (h2 + i64(1 << 25)) >> 26
+       h3 += carry2
+       h2 -= carry2 << 26
+       carry6 = (h6 + i64(1 << 25)) >> 26
+       h7 += carry6
+       h6 -= carry6 << 26
+
+       carry3 = (h3 + i64(1 << 24)) >> 25
+       h4 += carry3
+       h3 -= carry3 << 25
+       carry7 = (h7 + i64(1 << 24)) >> 25
+       h8 += carry7
+       h7 -= carry7 << 25
+
+       carry4 = (h4 + i64(1 << 25)) >> 26
+       h5 += carry4
+       h4 -= carry4 << 26
+       carry8 = (h8 + i64(1 << 25)) >> 26
+       h9 += carry8
+       h8 -= carry8 << 26
+
+       carry9 = (h9 + i64(1 << 24)) >> 25
+       h0 += carry9 * 19
+       h9 -= carry9 << 25
+
+       carry0 = (h0 + i64(1 << 25)) >> 26
+       h1 += carry0
+       h0 -= carry0 << 26
+
+       const h_ptr: usize = changetype<usize>(h)
+       store<i32>(h_ptr, h0, 0)
+       store<i32>(h_ptr, h1, 4)
+       store<i32>(h_ptr, h2, 8)
+       store<i32>(h_ptr, h3, 12)
+       store<i32>(h_ptr, h4, 16)
+       store<i32>(h_ptr, h5, 20)
+       store<i32>(h_ptr, h6, 24)
+       store<i32>(h_ptr, h7, 28)
+       store<i32>(h_ptr, h8, 32)
+       store<i32>(h_ptr, h9, 36)
  }
  
  /**
@@ -598,21 +1415,12 @@ export function fe_sq2 (h: FieldElement, f: FieldElement): void {
  //@ts-expect-error
  @inline
  export function fe_sub (h: FieldElement, f: FieldElement, g: FieldElement): void {
-       const o = changetype<usize>(h)
-       const a = changetype<usize>(f)
-       const b = changetype<usize>(g)
-
-       let ai = v128.load(a)
-       let bi = v128.load(b)
-       v128.store(o, v128.sub<i32>(ai, bi))
-
-       ai = v128.load(a, 16)
-       bi = v128.load(b, 16)
-       v128.store(o, v128.sub<i32>(ai, bi), 16)
-
-       ai = i64x2(load<i64>(a + 32), 0)
-       bi = i64x2(load<i64>(b + 32), 0)
-       v128.store_lane<i64>(o, v128.sub<i32>(ai, bi), 0, 32)
+       const f_ptr = changetype<usize>(f)
+       const g_ptr = changetype<usize>(g)
+       const h_ptr = changetype<usize>(h)
+       v128.store(h_ptr, v128.sub<i32>(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0)
+       v128.store(h_ptr, v128.sub<i32>(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16)
+       v128.store_lane<i64>(h_ptr, v128.sub<i32>(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32)
  }
  
  const fe_tobytes_t: FieldElement = fe()
@@ -626,183 +1434,49 @@ const fe_tobytes_t: FieldElement = fe()
  export function fe_tobytes (s: StaticArray<u8>, h: FieldElement): void {
         const t = fe_tobytes_t
         fe_reduce(t, h)
-       s[0] = u8(t[0] >> 0)
-       s[1] = u8(t[0] >> 8)
-       s[2] = u8(t[0] >> 16)
-       s[3] = u8((t[0] >> 24) | (t[1] * (u32(1) << 2)))
-       s[4] = u8(t[1] >> 6)
-       s[5] = u8(t[1] >> 14)
-       s[6] = u8((t[1] >> 22) | (t[2] * (u32(1) << 3)))
-       s[7] = u8(t[2] >> 5)
-       s[8] = u8(t[2] >> 13)
-       s[9] = u8((t[2] >> 21) | (t[3] * (u32(1) << 5)))
-       s[10] = u8(t[3] >> 3)
-       s[11] = u8(t[3] >> 11)
-       s[12] = u8((t[3] >> 19) | (t[4] * (u32(1) << 6)))
-       s[13] = u8(t[4] >> 2)
-       s[14] = u8(t[4] >> 10)
-       s[15] = u8(t[4] >> 18)
-       s[16] = u8(t[5] >> 0)
-       s[17] = u8(t[5] >> 8)
-       s[18] = u8(t[5] >> 16)
-       s[19] = u8((t[5] >> 24) | (t[6] * (u32(1) << 1)))
-       s[20] = u8(t[6] >> 7)
-       s[21] = u8(t[6] >> 15)
-       s[22] = u8((t[6] >> 23) | (t[7] * (u32(1) << 3)))
-       s[23] = u8(t[7] >> 5)
-       s[24] = u8(t[7] >> 13)
-       s[25] = u8((t[7] >> 21) | (t[8] * (u32(1) << 4)))
-       s[26] = u8(t[8] >> 4)
-       s[27] = u8(t[8] >> 12)
-       s[28] = u8((t[8] >> 20) | (t[9] * (u32(1) << 6)))
-       s[29] = u8(t[9] >> 2)
-       s[30] = u8(t[9] >> 10)
-       s[31] = u8(t[9] >> 18)
-}
-
-/**
- * Internal function.
- *
- * Normalize results of `fe_multiply` and `fe_square` across limbs. This
- * requires a reduction step and two carry steps.
- *
- *     // reduce
- *     for (let i = 0; i < 15; i++) {
- *             t[i] += 38 * t[i + 16]
- *     }
- *     // carry twice
- *     for (let i = 0; i < 2; i++) {
- *             for (let i = 0; i < 16; i++) {
- *                     c += t[i]
- *                     t[i] = c & 0xFFFF
- *                     c >>= 16
- *             }
- *             t[0] += 38 * c
- * }
-*/
-//@ts-expect-error
-@inline
-function normalize (o: usize, t: usize): void {
-       // reduce from 20 limbs to 10
-       let x = load<i64>(t)
-       let y = load<i64>(t, 80)
-       store<i64>(t, x + (19 * y))
-
-       x = load<i64>(t, 8)
-       y = load<i64>(t, 88)
-       store<i64>(t, x + (19 * y), 8)
-
-       x = load<i64>(t, 16)
-       y = load<i64>(t, 96)
-       store<i64>(t, x + (19 * y), 16)
-
-       x = load<i64>(t, 24)
-       y = load<i64>(t, 104)
-       store<i64>(t, x + (19 * y), 24)
-
-       x = load<i64>(t, 32)
-       y = load<i64>(t, 112)
-       store<i64>(t, x + (19 * y), 32)
-
-       x = load<i64>(t, 40)
-       y = load<i64>(t, 120)
-       store<i64>(t, x + (19 * y), 40)
-
-       x = load<i64>(t, 48)
-       y = load<i64>(t, 128)
-       store<i64>(t, x + (19 * y), 48)
-
-       x = load<i64>(t, 56)
-       y = load<i64>(t, 136)
-       store<i64>(t, x + (19 * y), 56)
-
-       x = load<i64>(t, 64)
-       y = load<i64>(t, 144)
-       store<i64>(t, x + (19 * y), 64)
-
-       x = load<i64>(t, 72)
-       y = load<i64>(t, 152)
-       store<i64>(t, x + (19 * y), 72)
-
-       // first carry
-       let v: i64 = load<i64>(t, 0)
-       let c: i64 = (v + (1 << 25)) >> 26
-       store<i64>(t, v - (c << 26), 0)
-
-       v = load<i64>(t, 8) + c
-       c = (v + (1 << 24)) >> 25
-       store<i64>(t, v - (c << 25), 8)
-
-       v = load<i64>(t, 16) + c
-       c = (v + (1 << 25)) >> 26
-       store<i64>(t, v - (c << 26), 16)
-
-       v = load<i64>(t, 24) + c
-       c = (v + (1 << 24)) >> 25
-       store<i64>(t, v - (c << 25), 24)
-
-       v = load<i64>(t, 32) + c
-       c = (v + (1 << 25)) >> 26
-       store<i64>(t, v - (c << 26), 32)
-
-       v = load<i64>(t, 40) + c
-       c = (v + (1 << 24)) >> 25
-       store<i64>(t, v - (c << 25), 40)
-
-       v = load<i64>(t, 48) + c
-       c = (v + (1 << 25)) >> 26
-       store<i64>(t, v - (c << 26), 48)
-
-       v = load<i64>(t, 56) + c
-       c = (v + (1 << 24)) >> 25
-       store<i64>(t, v - (c << 25), 56)
-
-       v = load<i64>(t, 64) + c
-       c = (v + (1 << 25)) >> 26
-       store<i64>(t, v - (c << 26), 64)
-
-       v = load<i64>(t, 72) + c
-       c = (v + (1 << 24)) >> 25
-       store<i64>(t, v - (c << 25), 72)
-
-       // fold final carry back to start then do second carry and assign to output
-       v = load<i64>(t, 0) + (19 * c)
-       c = (v + (1 << 25)) >> 26
-       store<i32>(o, v - (c << 26), 0)
-
-       v = load<i64>(t, 8) + c
-       c = (v + (1 << 24)) >> 25
-       store<i32>(o, v - (c << 25), 4)
-
-       v = load<i64>(t, 16) + c
-       c = (v + (1 << 25)) >> 26
-       store<i32>(o, v - (c << 26), 8)
-
-       v = load<i64>(t, 24) + c
-       c = (v + (1 << 24)) >> 25
-       store<i32>(o, v - (c << 25), 12)
-
-       v = load<i64>(t, 32) + c
-       c = (v + (1 << 25)) >> 26
-       store<i32>(o, v - (c << 26), 16)
-
-       v = load<i64>(t, 40) + c
-       c = (v + (1 << 24)) >> 25
-       store<i32>(o, v - (c << 25), 20)
-
-       v = load<i64>(t, 48) + c
-       c = (v + (1 << 25)) >> 26
-       store<i32>(o, v - (c << 26), 24)
-
-       v = load<i64>(t, 56) + c
-       c = (v + (1 << 24)) >> 25
-       store<i32>(o, v - (c << 25), 28)
-
-       v = load<i64>(t, 64) + c
-       c = (v + (1 << 25)) >> 26
-       store<i32>(o, v - (c << 26), 32)
-
-       v = load<i64>(t, 72) + c
-       c = (v + (1 << 24)) >> 25
-       store<i32>(o, v - (c << 25), 36)
+       const t_ptr: usize = changetype<usize>(t)
+       const t0 = load<i32>(t_ptr, 0)
+       const t1 = load<i32>(t_ptr, 4)
+       const t2 = load<i32>(t_ptr, 8)
+       const t3 = load<i32>(t_ptr, 12)
+       const t4 = load<i32>(t_ptr, 16)
+       const t5 = load<i32>(t_ptr, 20)
+       const t6 = load<i32>(t_ptr, 24)
+       const t7 = load<i32>(t_ptr, 28)
+       const t8 = load<i32>(t_ptr, 32)
+       const t9 = load<i32>(t_ptr, 36)
+
+       const s_ptr: usize = changetype<usize>(s)
+       store<u8>(s_ptr, t0 >> 0, 0)
+       store<u8>(s_ptr, t0 >> 8, 1)
+       store<u8>(s_ptr, t0 >> 16, 2)
+       store<u8>(s_ptr, (t0 >> 24) | (t1 << 2), 3)
+       store<u8>(s_ptr, t1 >> 6, 4)
+       store<u8>(s_ptr, t1 >> 14, 5)
+       store<u8>(s_ptr, (t1 >> 22) | (t2 << 3), 6)
+       store<u8>(s_ptr, t2 >> 5, 7)
+       store<u8>(s_ptr, t2 >> 13, 8)
+       store<u8>(s_ptr, (t2 >> 21) | (t3 << 5), 9)
+       store<u8>(s_ptr, t3 >> 3, 10)
+       store<u8>(s_ptr, t3 >> 11, 11)
+       store<u8>(s_ptr, (t3 >> 19) | (t4 << 6), 12)
+       store<u8>(s_ptr, t4 >> 2, 13)
+       store<u8>(s_ptr, t4 >> 10, 14)
+       store<u8>(s_ptr, t4 >> 18, 15)
+       store<u8>(s_ptr, t5 >> 0, 16)
+       store<u8>(s_ptr, t5 >> 8, 17)
+       store<u8>(s_ptr, t5 >> 16, 18)
+       store<u8>(s_ptr, (t5 >> 24) | (t6 << 1), 19)
+       store<u8>(s_ptr, t6 >> 7, 20)
+       store<u8>(s_ptr, t6 >> 15, 21)
+       store<u8>(s_ptr, (t6 >> 23) | (t7 << 3), 22)
+       store<u8>(s_ptr, t7 >> 5, 23)
+       store<u8>(s_ptr, t7 >> 13, 24)
+       store<u8>(s_ptr, (t7 >> 21) | (t8 << 4), 25)
+       store<u8>(s_ptr, t8 >> 4, 26)
+       store<u8>(s_ptr, t8 >> 12, 27)
+       store<u8>(s_ptr, (t8 >> 20) | (t9 << 6), 28)
+       store<u8>(s_ptr, t9 >> 2, 29)
+       store<u8>(s_ptr, t9 >> 10, 30)
+       store<u8>(s_ptr, t9 >> 18, 31)
  }
diff --git a/assembly/ge.ts b/assembly/ge.ts

index 386a7c93ef02611bbc2839f62f00e30978acfb04..6abd6424a8c9ce7ee0780822f8888083bfb0bfe3 100644 (file)
--- a/assembly/ge.ts
+++ b/assembly/ge.ts
@@ -17,77 +17,13 @@
   * - ge_precomp (Duif): (y+x,y-x,2dxy)
  */
  import { base, base2 } from './base'
-import { ed25519_d, ed25519_d2, fe25519_sqrtm1 } from './constants'
-import { fe, fe_0, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_invert, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sq2, fe_sub, fe_tobytes, FieldElement } from './fe'
+import { ed25519_d, fe25519_sqrtm1 } from './constants'
+import { fe, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sub, FieldElement } from './fe'
+import { ge_add_cached, ge_add_precomp, ge_cached, ge_p1p1, ge_p1p1_to_p2, ge_p1p1_to_p3, ge_p2, ge_p2_0, ge_p2_dbl, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_dbl, ge_p3_to_cached, ge_p3_tobytes, ge_precomp, ge_precomp_0, ge_sub_cached, ge_sub_precomp } from './p'
  import { equal, negative, optblocker_u8 } from './utils'
  
-export class ge_p2 {
-       X: FieldElement = fe()
-       Y: FieldElement = fe()
-       Z: FieldElement = fe()
-}
-
-export class ge_p3 extends ge_p2 {
-       T: FieldElement = fe()
-       __brand: 'ge_p3' = 'ge_p3'
-}
-
-export class ge_p1p1 extends ge_p2 {
-       T: FieldElement = fe()
-       __brand: 'ge_p1p1' = 'ge_p1p1'
-}
-
-export class ge_precomp {
-       yplusx: FieldElement = fe()
-       yminusx: FieldElement = fe()
-       xy2d: FieldElement = fe()
-}
-
-export class ge_cached {
-       YplusX: FieldElement = fe()
-       YminusX: FieldElement = fe()
-       Z: FieldElement = fe()
-       T2d: FieldElement = fe()
-}
-
-//@ts-expect-error
-@inline
-function ge_cached_8 (): StaticArray<ge_cached> {
-       return StaticArray.fromArray<ge_cached>([
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached(),
-               new ge_cached()
-       ])
-}
-
-//@ts-expect-error
-@inline
-export function ge_p2_0 (h: ge_p2): void {
-       fe_0(h.X)
-       fe_1(h.Y)
-       fe_1(h.Z)
-}
-
-//@ts-expect-error
-@inline
-export function ge_p3_0 (h: ge_p3): void {
-       ge_p2_0(h)
-       fe_0(h.T)
-}
-
  //@ts-expect-error
  @inline
-export function ge_precomp_0 (h: ge_precomp): void {
-       fe_1(h.yplusx)
-       fe_1(h.yminusx)
-       fe_0(h.xy2d)
-}
-
  function ge_cmov (t: ge_precomp, u: ge_precomp, b: u8): void {
         fe_cmov(t.yplusx, u.yplusx, b)
         fe_cmov(t.yminusx, u.yminusx, b)
@@ -116,354 +52,12 @@ function ge_cmov8 (t: ge_precomp, precomp: StaticArray<ge_precomp>, b: i8): void
         ge_cmov(t, minust, bnegative)
  }
  
+//@ts-expect-error
+@inline
  function ge_cmov8_base (t: ge_precomp, pos: i32, b: i8): void {
         ge_cmov8(t, base[pos], b)
  }
  
-const ge_add_precomp_t0: FieldElement = fe()
-/**
- * r = p + q
- */
-function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
-       const t0 = ge_add_precomp_t0
-       fe_add(r.X, p.Y, p.X)
-       fe_sub(r.Y, p.Y, p.X)
-       fe_mul(r.Z, r.X, q.yplusx)
-       fe_mul(r.Y, r.Y, q.yminusx)
-       fe_mul(r.T, q.xy2d, p.T)
-       fe_add(t0, p.Z, p.Z)
-       fe_sub(r.X, r.Z, r.Y)
-       fe_add(r.Y, r.Z, r.Y)
-       fe_add(r.Z, t0, r.T)
-       fe_sub(r.T, t0, r.T)
-}
-
-function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void {
-       fe_mul(r.X, p.X, p.T)
-       fe_mul(r.Y, p.Y, p.Z)
-       fe_mul(r.Z, p.Z, p.T)
-       fe_mul(r.T, p.X, p.Y)
-}
-
-function ge_p3_to_p2 (r: ge_p2, p: ge_p3): void {
-       fe_copy(r.X, p.X)
-       fe_copy(r.Y, p.Y)
-       fe_copy(r.Z, p.Z)
-}
-
-const ge_p2_dbl_t0: FieldElement = fe()
-function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void {
-       const t0 = ge_p2_dbl_t0
-       fe_sq(r.X, p.X)
-       fe_sq(r.Z, p.Y)
-       fe_sq2(r.T, p.Z)
-       fe_add(r.Y, p.X, p.Y)
-       fe_sq(t0, r.Y)
-       fe_add(r.Y, r.Z, r.X)
-       fe_sub(r.Z, r.Z, r.X)
-       fe_sub(r.X, t0, r.Y)
-       fe_sub(r.T, r.T, r.Z)
-}
-
-const ge_p3_dbl_q: ge_p2 = new ge_p2()
-/**
- * r = 2 * p
- */
-function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void {
-       const q = ge_p3_dbl_q
-       ge_p3_to_p2(q, p)
-       ge_p2_dbl(r, q)
-}
-
-/**
- * r = p
- */
-function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void {
-       fe_mul(r.X, p.X, p.T)
-       fe_mul(r.Y, p.Y, p.Z)
-       fe_mul(r.Z, p.Z, p.T)
-}
-
-/**
- * r = p
- */
-function ge_p3_to_cached (r: ge_cached, p: ge_p3): void {
-       fe_add(r.YplusX, p.Y, p.X)
-       fe_sub(r.YminusX, p.Y, p.X)
-       fe_copy(r.Z, p.Z)
-       fe_mul(r.T2d, p.T, ed25519_d2)
-}
-
-const ge_add_cached_t0: FieldElement = fe()
-/**
- * r = p + q
- */
-function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
-       const t0 = ge_add_cached_t0
-       fe_add(r.X, p.Y, p.X)
-       fe_sub(r.Y, p.Y, p.X)
-       fe_mul(r.Z, r.X, q.YplusX)
-       fe_mul(r.Y, r.Y, q.YminusX)
-       fe_mul(r.T, q.T2d, p.T)
-       fe_mul(r.X, p.Z, q.Z)
-       fe_add(t0, r.X, r.X)
-       fe_sub(r.X, r.Z, r.Y)
-       fe_add(r.Y, r.Z, r.Y)
-       fe_add(r.Z, t0, r.T)
-       fe_sub(r.T, t0, r.T)
-}
-
-function ge_cached_0 (h: ge_cached): void {
-       fe_1(h.YplusX)
-       fe_1(h.YminusX)
-       fe_1(h.Z)
-       fe_0(h.T2d)
-}
-
-function ge_cmov_cached (t: ge_cached, u: ge_cached, b: u8): void {
-       fe_cmov(t.YplusX, u.YplusX, b)
-       fe_cmov(t.YminusX, u.YminusX, b)
-       fe_cmov(t.Z, u.Z, b)
-       fe_cmov(t.T2d, u.T2d, b)
-}
-
-const ge_cmov8_cached_minust: ge_cached = new ge_cached()
-function ge_cmov8_cached (t: ge_cached, cached: StaticArray<ge_cached>, b: i8): void {
-       const minust = ge_cmov8_cached_minust
-       const bnegative: u8 = negative(b)
-       const babs: u8 = b - (((-bnegative) & b) << 1)
-
-       ge_cached_0(t)
-       ge_cmov_cached(t, cached[0], equal(babs, 1))
-       ge_cmov_cached(t, cached[1], equal(babs, 2))
-       ge_cmov_cached(t, cached[2], equal(babs, 3))
-       ge_cmov_cached(t, cached[3], equal(babs, 4))
-       ge_cmov_cached(t, cached[4], equal(babs, 5))
-       ge_cmov_cached(t, cached[5], equal(babs, 6))
-       ge_cmov_cached(t, cached[6], equal(babs, 7))
-       ge_cmov_cached(t, cached[7], equal(babs, 8))
-       fe_copy(minust.YplusX, t.YminusX)
-       fe_copy(minust.YminusX, t.YplusX)
-       fe_copy(minust.Z, t.Z)
-       fe_neg(minust.T2d, t.T2d)
-       ge_cmov_cached(t, minust, bnegative)
-}
-
-const ge_scalarmult_e: StaticArray<i8> = new StaticArray<i8>(64)
-const ge_scalarmult_r: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_s: ge_p2 = new ge_p2()
-const ge_scalarmult_t2: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t3: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t4: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t5: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t6: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t7: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t8: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_p2: ge_p3 = new ge_p3()
-const ge_scalarmult_p3: ge_p3 = new ge_p3()
-const ge_scalarmult_p4: ge_p3 = new ge_p3()
-const ge_scalarmult_p5: ge_p3 = new ge_p3()
-const ge_scalarmult_p6: ge_p3 = new ge_p3()
-const ge_scalarmult_p7: ge_p3 = new ge_p3()
-const ge_scalarmult_p8: ge_p3 = new ge_p3()
-const ge_scalarmult_pi: StaticArray<ge_cached> = ge_cached_8()
-const ge_scalarmult_t: ge_cached = new ge_cached()
-/**
- * `h = a * p`
- *
- * where
- *
- * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]`
- *
- * Preconditions:
- * - `a[31] <= 127`
- * - `p` is public
- */
-export function ge_scalarmult (h: ge_p3, a: StaticArray<u8>, p: ge_p3): void {
-       const e = ge_scalarmult_e
-       const r = ge_scalarmult_r
-       const s = ge_scalarmult_s
-       const t2 = ge_scalarmult_t2
-       const t3 = ge_scalarmult_t3
-       const t4 = ge_scalarmult_t4
-       const t5 = ge_scalarmult_t5
-       const t6 = ge_scalarmult_t6
-       const t7 = ge_scalarmult_t7
-       const t8 = ge_scalarmult_t8
-       const p2 = ge_scalarmult_p2
-       const p3 = ge_scalarmult_p3
-       const p4 = ge_scalarmult_p4
-       const p5 = ge_scalarmult_p5
-       const p6 = ge_scalarmult_p6
-       const p7 = ge_scalarmult_p7
-       const p8 = ge_scalarmult_p8
-       const pi = ge_scalarmult_pi
-       const t = ge_scalarmult_t
-       let carry: i8
-       let i: i8
-
-       ge_p3_to_cached(pi[1 - 1], p)   /* p */
-
-       ge_p3_dbl(t2, p)
-       ge_p1p1_to_p3(p2, t2)
-       ge_p3_to_cached(pi[2 - 1], p2) /* 2p = 2*p */
-
-       ge_add_cached(t3, p, pi[2 - 1])
-       ge_p1p1_to_p3(p3, t3)
-       ge_p3_to_cached(pi[3 - 1], p3) /* 3p = 2p+p */
-
-       ge_p3_dbl(t4, p2)
-       ge_p1p1_to_p3(p4, t4)
-       ge_p3_to_cached(pi[4 - 1], p4) /* 4p = 2*2p */
-
-       ge_add_cached(t5, p, pi[4 - 1])
-       ge_p1p1_to_p3(p5, t5)
-       ge_p3_to_cached(pi[5 - 1], p5) /* 5p = 4p+p */
-
-       ge_p3_dbl(t6, p3)
-       ge_p1p1_to_p3(p6, t6)
-       ge_p3_to_cached(pi[6 - 1], p6) /* 6p = 2*3p */
-
-       ge_add_cached(t7, p, pi[6 - 1])
-       ge_p1p1_to_p3(p7, t7)
-       ge_p3_to_cached(pi[7 - 1], p7) /* 7p = 6p+p */
-
-       ge_p3_dbl(t8, p4)
-       ge_p1p1_to_p3(p8, t8)
-       ge_p3_to_cached(pi[8 - 1], p8) /* 8p = 2*4p */
-
-       for (i = 0; i < 32; ++i) {
-               e[2 * i + 0] = (a[i] >> 0) & 15
-               e[2 * i + 1] = (a[i] >> 4) & 15
-       }
-       /* each e[i] is between 0 and 15 */
-       /* e[63] is between 0 and 7 */
-
-       carry = 0
-       for (i = 0; i < 63; ++i) {
-               e[i] += carry
-               carry = e[i] + 8
-               carry >>= 4
-               e[i] -= i8(carry << 4)
-       }
-       e[63] += carry
-       /* each e[i] is between -8 and 8 */
-
-       ge_p3_0(h)
-
-       for (i = 63; i != 0; i--) {
-               ge_cmov8_cached(t, pi, e[i])
-               ge_add_cached(r, h, t)
-
-               ge_p1p1_to_p2(s, r)
-               ge_p2_dbl(r, s)
-               ge_p1p1_to_p2(s, r)
-               ge_p2_dbl(r, s)
-               ge_p1p1_to_p2(s, r)
-               ge_p2_dbl(r, s)
-               ge_p1p1_to_p2(s, r)
-               ge_p2_dbl(r, s)
-
-               ge_p1p1_to_p3(h, r)  /* *16 */
-       }
-       ge_cmov8_cached(t, pi, e[i])
-       ge_add_cached(r, h, t)
-
-       ge_p1p1_to_p3(h, r)
-}
-
-const ge_scalarmult_base_e: StaticArray<i8> = new StaticArray<i8>(64)
-const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_base_s: ge_p2 = new ge_p2()
-const ge_scalarmult_base_t: ge_precomp = new ge_precomp()
-/**
- * `h = a * B` (with precomputation)
- *
- * where
- *
- * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]`
- *
- * B is the Ed25519 base point `(x,⅘)` with `x` positive. As bytes:
- *
- * `0x5866666666666666666666666666666666666666666666666666666666666666`
- *
- * Preconditions:
- * - `a[31] <= 127`
- */
-export function ge_scalarmult_base (h: ge_p3, a: StaticArray<u8>): void {
-       const e = ge_scalarmult_base_e
-       const r = ge_scalarmult_base_r
-       const s = ge_scalarmult_base_s
-       const t = ge_scalarmult_base_t
-
-       for (let i = 0; i < 32; ++i) {
-               e[(i << 1) + 0] = (a[i] >> 0) & 15
-               e[(i << 1) + 1] = (a[i] >> 4) & 15
-       }
-       /* each e[i] is between 0 and 15 */
-       /* e[63] is between 0 and 7 */
-
-       let carry: i8 = 0
-       for (let i = 0; i < 63; ++i) {
-               e[i] += carry
-               carry = e[i] + 8
-               carry >>= 4
-               e[i] -= carry * (i8(1) << 4)
-       }
-       e[63] += carry
-       /* each e[i] is between -8 and 8 */
-
-       ge_p3_0(h)
-
-       for (let i = 1; i < 64; i += 2) {
-               ge_cmov8_base(t, i / 2, e[i])
-               ge_add_precomp(r, h, t)
-               ge_p1p1_to_p3(h, r)
-       }
-
-       ge_p3_dbl(r, h)
-       ge_p1p1_to_p2(s, r)
-       ge_p2_dbl(r, s)
-       ge_p1p1_to_p2(s, r)
-       ge_p2_dbl(r, s)
-       ge_p1p1_to_p2(s, r)
-       ge_p2_dbl(r, s)
-       ge_p1p1_to_p3(h, r)
-
-       for (let i = 0; i < 64; i += 2) {
-               ge_cmov8_base(t, i / 2, e[i])
-               ge_add_precomp(r, h, t)
-               ge_p1p1_to_p3(h, r)
-       }
-}
-
-const ge_p3_tobytes_recip: FieldElement = fe()
-const ge_p3_tobytes_x: FieldElement = fe()
-const ge_p3_tobytes_y: FieldElement = fe()
-export function ge_p3_tobytes (s: StaticArray<u8>, h: ge_p3): void {
-       const recip = ge_p3_tobytes_recip
-       const x = ge_p3_tobytes_x
-       const y = ge_p3_tobytes_y
-       fe_invert(recip, h.Z)
-       fe_mul(x, h.X, recip)
-       fe_mul(y, h.Y, recip)
-       fe_tobytes(s, y)
-       s[31] ^= fe_isnegative(x) << 7
-}
-
-/**
- * true if `s < p`
- */
-export function ge_is_canonical (s: StaticArray<u8>): u8 {
-       let c: u8 = (s[31] & 0x7f) ^ 0x7f
-       for (let i = 30; i > 0; i--) {
-               c |= s[i] ^ 0xff
-       }
-       c = (c - 1) >> 8
-       const d: u8 = (0xec - s[0]) >> 8
-       return 1 - (c & d & 1)
-}
-
  const u: FieldElement = fe()
  const v: FieldElement = fe()
  const v3: FieldElement = fe()
@@ -499,7 +93,7 @@ export function ge_frombytes_negate_vartime (h: ge_p3, s: StaticArray<u8>): i32
                 fe_mul(h.X, h.X, fe25519_sqrtm1)
         }
  
-       if (fe_isnegative(h.X) == (s[31] >> 7)) { /* vartime function - compiler optimization is fine */
+       if (fe_isnegative(h.X) == (load<u8>(changetype<usize>(s), 31) >> 7)) { /* vartime function - compiler optimization is fine */
                 fe_neg(h.X, h.X)
         }
         fe_mul(h.T, h.X, h.Y)
@@ -572,45 +166,102 @@ export function ge_has_small_order (p: ge_p3): i32 {
  }
  
  /**
- * r = p
+ * true if `s < p`
   */
-export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void {
-       fe_copy(r.X, p.X)
-       fe_copy(r.Y, p.Y)
-       fe_copy(r.Z, p.Z)
-       fe_mul(r.T, p.X, p.Y)
+export function ge_is_canonical (s: StaticArray<u8>): u8 {
+       let c: u8 = (s[31] & 0x7f) ^ 0x7f
+       for (let i = 30; i > 0; i--) {
+               c |= s[i] ^ 0xff
+       }
+       c = (c - 1) >> 8
+       const d: u8 = (0xec - s[0]) >> 8
+       return 1 - (c & d & 1)
  }
  
-const ge_p3_sub_q_neg: ge_p3 = new ge_p3()
-/* r = p-q */
-export function ge_p3_sub (r: ge_p3, p: ge_p3, q: ge_p3): void {
-       const q_neg = ge_p3_sub_q_neg
-       ge_p3_neg(q_neg, q)
-       ge_p3_add(r, p, q_neg)
-}
+const ge_scalarmult_base_e: StaticArray<i8> = new StaticArray<i8>(64)
+const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1()
+const ge_scalarmult_base_s: ge_p2 = new ge_p2()
+const ge_scalarmult_base_t: ge_precomp = new ge_precomp()
+/**
+ * `h = a * B` (with precomputation)
+ *
+ * where
+ *
+ * `a = 256⁰a[0] + 256¹a[1] + ... + 256³¹a[31]`
+ *
+ * B is the Ed25519 base point `(x,⅘)` with `x` positive. As bytes:
+ *
+ * `0x5866666666666666666666666666666666666666666666666666666666666666`
+ *
+ * Preconditions:
+ * - `a[31] <= 127`
+ */
+function ge_scalarmult_base (h: ge_p3, a: StaticArray<u8>): void {
+       const e = ge_scalarmult_base_e
+       const r = ge_scalarmult_base_r
+       const s = ge_scalarmult_base_s
+       const t = ge_scalarmult_base_t
+
+       for (let i = 0; i < 32; ++i) {
+               e[(i << 1) + 0] = (a[i] >> 0) & 15
+               e[(i << 1) + 1] = (a[i] >> 4) & 15
+       }
+       /* each e[i] is between 0 and 15 */
+       /* e[63] is between 0 and 7 */
+
+       let carry: i8 = 0
+       for (let i = 0; i < 63; ++i) {
+               e[i] += carry
+               carry = e[i] + 8
+               carry >>= 4
+               e[i] -= carry * (i8(1) << 4)
+       }
+       e[63] += carry
+       /* each e[i] is between -8 and 8 */
+
+       ge_p3_0(h)
+
+       for (let i = 1; i < 64; i += 2) {
+               ge_cmov8_base(t, i / 2, e[i])
+               ge_add_precomp(r, h, t)
+               ge_p1p1_to_p3(h, r)
+       }
+
+       ge_p3_dbl(r, h)
+       ge_p1p1_to_p2(s, r)
+       ge_p2_dbl(r, s)
+       ge_p1p1_to_p2(s, r)
+       ge_p2_dbl(r, s)
+       ge_p1p1_to_p2(s, r)
+       ge_p2_dbl(r, s)
+       ge_p1p1_to_p3(h, r)
  
-/* r = -p */
-function ge_p3_neg (r: ge_p3, p: ge_p3): void {
-       fe_neg(r.X, p.X)
-       fe_copy(r.Y, p.Y)
-       fe_copy(r.Z, p.Z)
-       fe_neg(r.T, p.T)
+       for (let i = 0; i < 64; i += 2) {
+               ge_cmov8_base(t, i / 2, e[i])
+               ge_add_precomp(r, h, t)
+               ge_p1p1_to_p3(h, r)
+       }
  }
  
-const ge_p3_add_q_cached = new ge_cached()
-const ge_p3_add_q_p1p1 = new ge_p1p1()
-/* r = p+q */
-function ge_p3_add (r: ge_p3, p: ge_p3, q: ge_p3): void {
-       const q_cached = ge_p3_add_q_cached
-       const p1p1 = ge_p3_add_q_p1p1
-       ge_p3_to_cached(q_cached, q)
-       ge_add_cached(p1p1, p, q_cached)
-       ge_p1p1_to_p3(r, p1p1)
+const ge_scalarmult_base_tobytes_h: ge_p3 = new ge_p3()
+export function ge_scalarmult_base_tobytes (s: StaticArray<u8>, a: StaticArray<u8>): void {
+       const h = ge_scalarmult_base_tobytes_h
+       ge_scalarmult_base(h, a)
+       ge_p3_tobytes(s, h)
  }
  
  const ge_double_scalarmult_vartime_aslide: StaticArray<i8> = new StaticArray<i8>(256)
  const ge_double_scalarmult_vartime_bslide: StaticArray<i8> = new StaticArray<i8>(256)
-const ge_double_scalarmult_vartime_Ai: StaticArray<ge_cached> = ge_cached_8() /* A,3A,5A,7A,9A,11A,13A,15A */
+const ge_double_scalarmult_vartime_Ai: StaticArray<ge_cached> = StaticArray.fromArray<ge_cached>([
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached(),
+       new ge_cached()
+]) /* A,3A,5A,7A,9A,11A,13A,15A */
  const ge_double_scalarmult_vartime_t: ge_p1p1 = new ge_p1p1()
  const ge_double_scalarmult_vartime_u: ge_p3 = new ge_p3()
  const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3()
@@ -622,7 +273,7 @@ const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3()
   *
   * Only used for signatures verification.
   */
-export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
+function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
         const Bi = base2
         const aslide = ge_double_scalarmult_vartime_aslide
         const bslide = ge_double_scalarmult_vartime_bslide
@@ -699,6 +350,13 @@ export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: g
         }
  }
  
+const ge_double_scalarmult_vartime_p: ge_p2 = new ge_p2()
+export function ge_double_scalarmult_vartime_to_p3 (r: ge_p3, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
+       const p = ge_double_scalarmult_vartime_p
+       ge_double_scalarmult_vartime(p, a, A, b)
+       ge_p2_to_p3(r, p)
+}
+
  function slide_vartime (r: StaticArray<i8>, a: StaticArray<u8>): void {
         let i: i32
         let b: i32
@@ -739,40 +397,3 @@ function slide_vartime (r: StaticArray<i8>, a: StaticArray<u8>): void {
                 }
         }
  }
-
-const ge_sub_cached_t0: FieldElement = fe()
-/**
- * r = p - q
- */
-function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
-       const t0 = ge_sub_cached_t0
-       fe_add(r.X, p.Y, p.X)
-       fe_sub(r.Y, p.Y, p.X)
-       fe_mul(r.Z, r.X, q.YminusX)
-       fe_mul(r.Y, r.Y, q.YplusX)
-       fe_mul(r.T, q.T2d, p.T)
-       fe_mul(r.X, p.Z, q.Z)
-       fe_add(t0, r.X, r.X)
-       fe_sub(r.X, r.Z, r.Y)
-       fe_add(r.Y, r.Z, r.Y)
-       fe_sub(r.Z, t0, r.T)
-       fe_add(r.T, t0, r.T)
-}
-
-const ge_sub_precomp_t0: FieldElement = fe()
-/**
- * r = p - q
- */
-function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
-       const t0 = ge_sub_precomp_t0
-       fe_add(r.X, p.Y, p.X)
-       fe_sub(r.Y, p.Y, p.X)
-       fe_mul(r.Z, r.X, q.yminusx)
-       fe_mul(r.Y, r.Y, q.yplusx)
-       fe_mul(r.T, q.xy2d, p.T)
-       fe_add(t0, p.Z, p.Z)
-       fe_sub(r.X, r.Z, r.Y)
-       fe_add(r.Y, r.Z, r.Y)
-       fe_sub(r.Z, t0, r.T)
-       fe_add(r.T, t0, r.T)
-}
diff --git a/assembly/nano-nacl.ts b/assembly/nano-nacl.ts

index 6e2f75b8b7c30bcdee9d2d59e76c0ac6762ffadb..36fadc7af0ca2d4c0ec87beb143973ab12299224 100644 (file)
--- a/assembly/nano-nacl.ts
+++ b/assembly/nano-nacl.ts
@@ -2,7 +2,8 @@
  //! SPDX-License-Identifier: GPL-3.0-or-later\r
  \r
  import { Blake2b } from './blake2b'\r
-import { ge_double_scalarmult_vartime, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_p2, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_sub, ge_p3_tobytes, ge_scalarmult_base } from './ge'\r
+import { ge_double_scalarmult_vartime_to_p3, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_scalarmult_base_tobytes } from './ge'\r
+import { ge_p3, ge_sub_p3 } from './p'\r
  import { sc_is_canonical, sc_muladd, sc_reduce } from './sc'\r
  \r
  const BLOCKHASH_BYTES: i32 = 32\r
@@ -29,18 +30,11 @@ function crypto_hash (o: StaticArray<u8>, i: StaticArray<u8>): void {
         blake2b.init().update(i).digest(o)\r
  }\r
  \r
-const crypto_derive_A = new ge_p3()\r
  function crypto_derive (pk: StaticArray<u8>, sk: StaticArray<u8>, seed: StaticArray<u8>): void {\r
-       const A = crypto_derive_A\r
-       ge_p3_0(A)\r
-\r
         crypto_hash(sk, seed)\r
         clamp(sk)\r
  \r
-       trace(`seed hashed to sk: ${sk.map((b: u8) => b.toString(16).padStart(2, '0')).join('')}`)\r
-\r
-       ge_scalarmult_base(A, sk)\r
-       ge_p3_tobytes(pk, A)\r
+       ge_scalarmult_base_tobytes(pk, sk)\r
         memory.copy(changetype<usize>(sk), changetype<usize>(seed), PRIVATEKEY_BYTES)\r
         memory.copy(changetype<usize>(sk) + 32, changetype<usize>(pk), PUBLICKEY_BYTES)\r
  }\r
@@ -48,17 +42,14 @@ function crypto_derive (pk: StaticArray<u8>, sk: StaticArray<u8>, seed: StaticAr
  const crypto_sign_az = new StaticArray<u8>(64)\r
  const crypto_sign_nonce = new StaticArray<u8>(64)\r
  const crypto_sign_hram = new StaticArray<u8>(64)\r
-const crypto_sign_R = new ge_p3()\r
  const crypto_sign_zm = new StaticArray<u8>(SIGNATURE_BYTES)\r
  const crypto_sign_t = new StaticArray<u8>(PRIVATEKEY_BYTES)\r
  function crypto_sign (s: StaticArray<u8>, m: StaticArray<u8>, sk: StaticArray<u8>): void {\r
         const az = crypto_sign_az\r
         const nonce = crypto_sign_nonce\r
         const hram = crypto_sign_hram\r
-       const R = crypto_sign_R\r
         const zm = crypto_sign_zm\r
         const t = crypto_sign_t\r
-       ge_p3_0(R)\r
  \r
         // Hash secret key to private scalar `a` and prefix for nonce derivation `z`\r
         memory.copy(changetype<usize>(t), changetype<usize>(sk), PRIVATEKEY_BYTES)\r
@@ -71,9 +62,8 @@ function crypto_sign (s: StaticArray<u8>, m: StaticArray<u8>, sk: StaticArray<u8
         crypto_hash(nonce, zm)\r
         sc_reduce(nonce)\r
  \r
-       // Compute R = rB\r
-       ge_scalarmult_base(R, nonce)\r
-       ge_p3_tobytes(s, R)\r
+       // Compute R = rB, output to bytes s\r
+       ge_scalarmult_base_tobytes(s, nonce)\r
  \r
         // Concatenate public key `A` and message `M`\r
         // from parameter arguments: A = sk[0,32], M = m\r
@@ -97,7 +87,6 @@ const crypto_verify_check = new ge_p3()
  const crypto_verify_expected_r = new ge_p3()\r
  const crypto_verify_A = new ge_p3()\r
  const crypto_verify_sb_ah = new ge_p3()\r
-const crypto_verify_sb_ah_p2 = new ge_p2()\r
  const crypto_verify_ram = new StaticArray<u8>(SIGNEDBLOCKHASH_BYTES)\r
  const crypto_verify_S = new StaticArray<u8>(32)\r
  /**\r
@@ -109,7 +98,6 @@ function crypto_verify (s: StaticArray<u8>, m: StaticArray<u8>, pk: StaticArray<
         const expected_r = crypto_verify_expected_r\r
         const A = crypto_verify_A\r
         const sb_ah = crypto_verify_sb_ah\r
-       const sb_ah_p2 = crypto_verify_sb_ah_p2\r
         const ram = crypto_verify_ram\r
         const S = crypto_verify_S\r
  \r
@@ -136,9 +124,8 @@ function crypto_verify (s: StaticArray<u8>, m: StaticArray<u8>, pk: StaticArray<
         crypto_hash(h, ram)\r
         sc_reduce(h)\r
  \r
-       ge_double_scalarmult_vartime(sb_ah_p2, h, A, S)\r
-       ge_p2_to_p3(sb_ah, sb_ah_p2)\r
-       ge_p3_sub(check, expected_r, sb_ah)\r
+       ge_double_scalarmult_vartime_to_p3(sb_ah, h, A, S)\r
+       ge_sub_p3(check, expected_r, sb_ah)\r
  \r
         return ge_has_small_order(check) - 1\r
  }\r
@@ -169,11 +156,7 @@ export function derive (): void {
                 seed[i] = load<u8>(INPUT_BUFFER + i)\r
         }\r
  \r
-       const start = performance.now()\r
         crypto_derive(pk, sk, seed)\r
-       const end = performance.now()\r
-       trace('derive time', 1, end - start)\r
-\r
         for (let i = 0; i < PUBLICKEY_BYTES; i++) {\r
                 store<u8>(OUTPUT_BUFFER + i, pk[i])\r
         }\r
@@ -202,11 +185,7 @@ export function sign (): void {
                 sk[i] = load<u8>(usize(i) + ptr)\r
         }\r
  \r
-       const start = performance.now()\r
         crypto_sign(s, h, sk)\r
-       const end = performance.now()\r
-       trace('sign time', 1, end - start)\r
-\r
         for (let i = 0; i < SIGNATURE_BYTES; i++) {\r
                 store<u8>(OUTPUT_BUFFER + i, s[i])\r
         }\r
@@ -241,10 +220,6 @@ export function verify (): void {
                 k[i] = load<u8>(usize(i) + ptr)\r
         }\r
  \r
-       const start = performance.now()\r
         const v = crypto_verify(s, h, k)\r
-       const end = performance.now()\r
-       trace('verify time', 1, end - start)\r
-\r
         store<u8>(OUTPUT_BUFFER, v)\r
  }\r
diff --git a/assembly/p.ts b/assembly/p.ts

new file mode 100644 (file)

index 0000000..537eeec
--- /dev/null
+++ b/assembly/p.ts
@@ -0,0 +1,236 @@
+//! SPDX-FileCopyrightText: 2026 Chris Duncan <chris@codecow.com>
+//! SPDX-License-Identifier: GPL-3.0-or-later
+
+import { ed25519_d2 } from './constants'
+import { FieldElement, fe, fe_0, fe_1, fe_add, fe_copy, fe_dbl, fe_invert, fe_isnegative, fe_mul, fe_neg, fe_sq, fe_sq_vec, fe_sq2, fe_sub, fe_tobytes } from './fe'
+
+export class ge_p2 {
+       X: FieldElement = fe()
+       Y: FieldElement = fe()
+       Z: FieldElement = fe()
+}
+
+export class ge_p1p1 extends ge_p2 {
+       T: FieldElement = fe()
+       __brand: 'ge_p1p1' = 'ge_p1p1'
+}
+
+export class ge_p3 extends ge_p2 {
+       T: FieldElement = fe()
+       __brand: 'ge_p3' = 'ge_p3'
+}
+export class ge_cached {
+       YplusX: FieldElement = fe()
+       YminusX: FieldElement = fe()
+       Z: FieldElement = fe()
+       T2d: FieldElement = fe()
+}
+
+export class ge_precomp {
+       yplusx: FieldElement = fe()
+       yminusx: FieldElement = fe()
+       xy2d: FieldElement = fe()
+}
+
+//@ts-expect-error
+@inline
+export function ge_p2_0 (h: ge_p2): void {
+       fe_0(h.X)
+       fe_1(h.Y)
+       fe_1(h.Z)
+}
+
+const ge_p2_dbl_t0: FieldElement = fe()
+//@ts-expect-error
+@inline
+export function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void {
+       const t0 = ge_p2_dbl_t0
+       fe_sq_vec(r.X, p.X, r.Z, p.Y)
+       fe_sq2(r.T, p.Z)
+       fe_add(r.Y, p.X, p.Y)
+       fe_sq(t0, r.Y)
+       fe_add(r.Y, r.Z, r.X)
+       fe_sub(r.Z, r.Z, r.X)
+       fe_sub(r.X, t0, r.Y)
+       fe_sub(r.T, r.T, r.Z)
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void {
+       fe_copy(r.X, p.X)
+       fe_copy(r.Y, p.Y)
+       fe_copy(r.Z, p.Z)
+       fe_mul(r.T, p.X, p.Y)
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void {
+       fe_mul(r.X, p.X, p.T)
+       fe_mul(r.Y, p.Y, p.Z)
+       fe_mul(r.Z, p.Z, p.T)
+}
+
+//@ts-expect-error
+@inline
+export function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void {
+       fe_mul(r.X, p.X, p.T)
+       fe_mul(r.Y, p.Y, p.Z)
+       fe_mul(r.Z, p.Z, p.T)
+       fe_mul(r.T, p.X, p.Y)
+}
+
+//@ts-expect-error
+@inline
+export function ge_p3_0 (h: ge_p3): void {
+       ge_p2_0(h)
+       fe_0(h.T)
+}
+
+const ge_p3_dbl_q: ge_p2 = new ge_p2()
+/**
+ * r = 2 * p
+ */
+export function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void {
+       const q = ge_p3_dbl_q
+       fe_copy(q.X, p.X)
+       fe_copy(q.Y, p.Y)
+       fe_copy(q.Z, p.Z)
+       ge_p2_dbl(r, q)
+}
+
+const ge_p3_tobytes_recip: FieldElement = fe()
+const ge_p3_tobytes_x: FieldElement = fe()
+const ge_p3_tobytes_y: FieldElement = fe()
+export function ge_p3_tobytes (s: StaticArray<u8>, h: ge_p3): void {
+       const recip = ge_p3_tobytes_recip
+       const x = ge_p3_tobytes_x
+       const y = ge_p3_tobytes_y
+       fe_invert(recip, h.Z)
+       fe_mul(x, h.X, recip)
+       fe_mul(y, h.Y, recip)
+       fe_tobytes(s, y)
+       s[31] ^= fe_isnegative(x) << 7
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p3_to_cached (r: ge_cached, p: ge_p3): void {
+       fe_add(r.YplusX, p.Y, p.X)
+       fe_sub(r.YminusX, p.Y, p.X)
+       fe_copy(r.Z, p.Z)
+       fe_mul(r.T2d, p.T, ed25519_d2)
+}
+
+//@ts-expect-error
+@inline
+export function ge_precomp_0 (h: ge_precomp): void {
+       fe_1(h.yplusx)
+       fe_1(h.yminusx)
+       fe_0(h.xy2d)
+}
+
+const ge_add_cached_t0: FieldElement = fe()
+/**
+ * r = p + q
+ */
+export function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
+       const t0 = ge_add_cached_t0
+       fe_add(r.X, p.Y, p.X)
+       fe_sub(r.Y, p.Y, p.X)
+       fe_mul(r.Z, r.X, q.YplusX)
+       fe_mul(r.Y, r.Y, q.YminusX)
+       fe_mul(r.T, q.T2d, p.T)
+       fe_mul(r.X, p.Z, q.Z)
+       fe_dbl(t0, r.X)
+       fe_sub(r.X, r.Z, r.Y)
+       fe_add(r.Y, r.Z, r.Y)
+       fe_add(r.Z, t0, r.T)
+       fe_sub(r.T, t0, r.T)
+}
+
+const ge_add_precomp_t0: FieldElement = fe()
+/**
+ * r = p + q
+ */
+//@ts-expect-error
+@inline
+export function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
+       const t0 = ge_add_precomp_t0
+       fe_add(r.X, p.Y, p.X)
+       fe_sub(r.Y, p.Y, p.X)
+       fe_mul(r.Z, r.X, q.yplusx)
+       fe_mul(r.Y, r.Y, q.yminusx)
+       fe_mul(r.T, q.xy2d, p.T)
+       fe_dbl(t0, p.Z)
+       fe_sub(r.X, r.Z, r.Y)
+       fe_add(r.Y, r.Z, r.Y)
+       fe_add(r.Z, t0, r.T)
+       fe_sub(r.T, t0, r.T)
+}
+
+const ge_sub_p3_q_cached = new ge_cached()
+const ge_sub_p3_p1p1 = new ge_p1p1()
+/* r = p-q */
+//@ts-expect-error
+@inline
+export function ge_sub_p3 (r: ge_p3, p: ge_p3, q: ge_p3): void {
+       const q_cached = ge_sub_p3_q_cached
+       const p1p1 = ge_sub_p3_p1p1
+
+       fe_neg(r.X, q.X)
+       fe_copy(r.Y, q.Y)
+       fe_copy(r.Z, q.Z)
+       fe_neg(r.T, q.T)
+
+       ge_p3_to_cached(q_cached, r)
+       ge_add_cached(p1p1, p, q_cached)
+       ge_p1p1_to_p3(r, p1p1)
+}
+
+const ge_sub_cached_t0: FieldElement = fe()
+/**
+ * r = p - q
+ */
+export function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
+       const t0 = ge_sub_cached_t0
+       fe_add(r.X, p.Y, p.X)
+       fe_sub(r.Y, p.Y, p.X)
+       fe_mul(r.Z, r.X, q.YminusX)
+       fe_mul(r.Y, r.Y, q.YplusX)
+       fe_mul(r.T, q.T2d, p.T)
+       fe_mul(r.X, p.Z, q.Z)
+       fe_dbl(t0, r.X)
+       fe_sub(r.X, r.Z, r.Y)
+       fe_add(r.Y, r.Z, r.Y)
+       fe_sub(r.Z, t0, r.T)
+       fe_add(r.T, t0, r.T)
+}
+
+const ge_sub_precomp_t0: FieldElement = fe()
+/**
+ * r = p - q
+ */
+export function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
+       const t0 = ge_sub_precomp_t0
+       fe_add(r.X, p.Y, p.X)
+       fe_sub(r.Y, p.Y, p.X)
+       fe_mul(r.Z, r.X, q.yminusx)
+       fe_mul(r.Y, r.Y, q.yplusx)
+       fe_mul(r.T, q.xy2d, p.T)
+       fe_dbl(t0, p.Z)
+       fe_sub(r.X, r.Z, r.Y)
+       fe_add(r.Y, r.Z, r.Y)
+       fe_sub(r.Z, t0, r.T)
+       fe_add(r.T, t0, r.T)
+}
diff --git a/index.html b/index.html

index fcfcd4c095ee4d01dc43c64351c8c0ff8780cb12..656709b443c5c1b42400d329898bf04fa164602a 100644 (file)
--- a/index.html
+++ b/index.html
@@ -112,26 +112,33 @@ SPDX-License-Identifier: GPL-3.0-or-later
                 }
  
                 export async function test (api, size, runs, isDebug, isSelfCheck) {
+                       if (typeof size !== 'number' || size < 1) {
+                               size = 1
+                       }
+                       if (typeof runs !== 'number' || runs < 1) {
+                               runs = 1
+                       }
                         const selectedApi = api
                         // Execute once on load to initialize worker and WASM
                         const h = random(), k = random(64)
                         await NanoNaCl.sign(h, k)
  
                         const derive = sk => {
+                               const sk8 = new Uint8Array(sk.match(/.{2}/g).map(b => parseInt(b, 16)))
                                 switch (api) {
                                         case 'NanoNaCl': {
-                                               return NanoNaCl.derive(sk)
+                                               const pk8 = NanoNaCl.deriveSync(sk8)
+                                               return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('')
                                         }
                                         case 'NanocurrencyWeb': {
                                                 return NanocurrencyWeb.wallet.legacyAccounts(sk)[0].publicKey
                                         }
                                         case 'Sodium': {
-                                               const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               return sodium.crypto_sign_seed_keypair(new Uint8Array(sk8), 'hex').publicKey
+                                               return sodium.crypto_sign_seed_keypair(sk8, 'hex').publicKey
                                         }
                                         case 'TweetNaCl.js': {
                                                 const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const pk8 = nacl.sign.keyPair.fromSeed(new Uint8Array(sk8)).publicKey
+                                               const pk8 = nacl.sign.keyPair.fromSeed(sk8).publicKey
                                                 return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('')
                                         }
                                         default: {
@@ -140,24 +147,21 @@ SPDX-License-Identifier: GPL-3.0-or-later
                                 }
                         }
                         const sign = (h, sk, pk) => {
+                               const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16)))
+                               const sk8 = new Uint8Array((sk + pk).match(/.{2}/g).map(b => parseInt(b, 16)))
                                 switch (api) {
                                         case 'NanoNaCl': {
-                                               return NanoNaCl.sign(h, sk + pk)
+                                               const s8 = NanoNaCl.signSync(h8, sk8)
+                                               return [...s8].map(b => b.toString(16).padStart(2, '0')).join('')
                                         }
                                         case 'NanocurrencyWeb': {
                                                 const { privateKey } = NanocurrencyWeb.wallet.legacyAccounts(sk)[0]
                                                 return NanocurrencyWeb.tools.sign(privateKey, h)
                                         }
                                         case 'Sodium': {
-                                               sk = sk + pk
-                                               const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
                                                 return sodium.crypto_sign_detached(new Uint8Array(h8), new Uint8Array(sk8), 'hex')
                                         }
                                         case 'TweetNaCl.js': {
-                                               sk = sk + pk
-                                               const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
                                                 const s8 = nacl.sign.detached(new Uint8Array(h8), new Uint8Array(sk8))
                                                 return [...s8].map(b => b.toString(16).padStart(2, '0')).join('')
                                         }
@@ -167,24 +171,22 @@ SPDX-License-Identifier: GPL-3.0-or-later
                                 }
                         }
                         const verify = (s, h, pk) => {
+                               const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16)))
+                               const s8 = new Uint8Array(s.match(/.{2}/g).map(b => parseInt(b, 16)))
+                               const pk8 = new Uint8Array(pk.match(/.{2}/g).map(b => parseInt(b, 16)))
                                 switch (api) {
                                         case 'NanoNaCl': {
-                                               return NanoNaCl.verify(s, h, pk)
+                                               const v = NanoNaCl.verifySync(s8, h8, pk8)
+                                               return v[0] === 0
                                         }
                                         case 'NanocurrencyWeb': {
                                                 return NanocurrencyWeb.tools.verify(pk, s, h)
                                         }
                                         case 'Sodium': {
-                                               const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               return sodium.crypto_sign_verify_detached(new Uint8Array(s8), new Uint8Array(h8), new Uint8Array(pk8))
+                                               return sodium.crypto_sign_verify_detached(s8, h8, pk8)
                                         }
                                         case 'TweetNaCl.js': {
-                                               const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16))
-                                               return nacl.sign.detached.verify(new Uint8Array(h8), new Uint8Array(s8), new Uint8Array(pk8))
+                                               return nacl.sign.detached.verify(h8, s8, pk8)
                                         }
                                         default: {
                                                 return null
@@ -196,6 +198,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
                                 api = 'NanoNaCl'
                                 document.getElementById('status').innerHTML = `RUNNING SELF-CHECK`
                                 console.log(`%cNanoNaCl`, 'color:green', 'Checking validation against known values')
+                               // https://docs.nano.org/integration-guides/key-management/#success-response_2
                                 TEST_VECTORS ??= {
                                         blockHash: 'BB569136FA05F8CBF65CEF2EDE368475B289C4477342976556BA4C0DDF216E45',
                                         privateKey: '781186FB9EF17DB6E3D1056550D9FAE5D5BBADA6A6BC370E4CBB938B1DC71DA3',
@@ -390,9 +393,9 @@ SPDX-License-Identifier: GPL-3.0-or-later
                 </select>
         </span>
         <label for="size">Test Size</label>
-       <input id="size" type="number" value="100" autofocus />
+       <input id="size" type="number" value="100" min="1" autofocus />
         <label for="runs">Test Runs</label>
-       <input id="runs" type="number" value="10" autofocus />
+       <input id="runs" type="number" value="100" min="1" autofocus />
         <span>
                 <label for="isDebug">Debug?</label>
                 <input id="isDebug" type="checkbox" />
diff --git a/index.ts b/index.ts

index d571e7e9a1920e78f915b4cc4670477b87b83aae..92d9644bcb0039728f7b4ef92311d7568e613ef9 100644 (file)
--- a/index.ts
+++ b/index.ts
@@ -16,7 +16,7 @@ type Data = {
         signature?: string
  }
  
-const NanoNaCl = async (bytes: number[]): Promise<void> => {
+export const NanoNaCl = async (bytes: number[]): Promise<any> => {
         let isReady = false
         let wasm
         let module
@@ -314,6 +314,129 @@ async function run (data: Record<string, string>): Promise<string> {
         }
  }
  
+let isReady = false
+let wasm
+let module
+let instance
+let memory: WebAssembly.Memory
+let deriveSync: (k: Uint8Array) => Uint8Array<ArrayBuffer>
+let signSync: (h: Uint8Array, k: Uint8Array) => Uint8Array<ArrayBuffer>
+let verifySync: (s: Uint8Array, h: Uint8Array, k: Uint8Array) => Uint8Array<ArrayBuffer>
+try {
+       wasm = Uint8Array.from(nacl)
+       module = await WebAssembly.compile(wasm)
+       instance = await WebAssembly.instantiate(module, {
+               env: {
+                       abort: (msg: any, file: any, row: any, col: any) => {
+                               // ~lib/builtins/abort(~lib/string/String | null?, ~lib/string/String | null?, u32?, u32?) => void
+                               // msg = __liftString(msg >>> 0)
+                               // file = __liftString(file >>> 0)
+                               row = row >>> 0
+                               col = col >>> 0
+                               console.error('wasm abort:', `msg ${msg}`, `file ${file}`, `row ${row}`, `col ${col}`)
+                               throw new Error(msg, { cause: { file, row, col } })
+                       },
+                       "performance.now" () {
+                               // ~lib/bindings/dom/performance.now() => f64
+                               return performance.now()
+                       },
+                       trace: (message: any, n?: number, a0?: number, a1?: number, a2?: number, a3?: number, a4?: number) => {
+                               // ~lib/builtins/trace(~lib/string/String, i32?, f64?, f64?, f64?, f64?, f64?) => void
+                               message = __liftString(message >>> 0);
+                               (() => {
+                                       // @external.js
+                                       console.log(message, ...[a0, a1, a2, a3, a4].slice(0, n))
+                               })()
+                       },
+                       memory: new WebAssembly.Memory({ initial: 4, maximum: 4 })
+               }
+       })
+       const { exports } = instance as { exports: { derive: Derive, sign: Sign, verify: Verify, getInputPointer: GetInputPointer, getOutputPointer: GetOutputPointer, memory: WebAssembly.Memory } }
+       memory = exports.memory
+
+       function __liftString (pointer: number) {
+               if (!pointer) return null
+               const
+                       //@ts-ignore
+                       end = pointer + new Uint32Array(memory.buffer)[pointer - 4 >>> 2] >>> 1,
+                       //@ts-ignore
+                       memoryU16 = new Uint16Array(memory.buffer)
+               let
+                       start = pointer >>> 1,
+                       string = ""
+               while (end - start > 1024) string += String.fromCharCode(...memoryU16.subarray(start, start += 1024))
+               return string + String.fromCharCode(...memoryU16.subarray(start, end))
+       }
+
+       deriveSync = function (k: Uint8Array): Uint8Array<ArrayBuffer> {
+               // assembly/nano-nacl/derive() => void
+               let buffer: DataView | undefined = new DataView(memory.buffer)
+               let inPtr = exports.getInputPointer()
+               for (let i = 0; i < 32; i++) {
+                       buffer.setUint8(inPtr + i, k[i])
+               }
+               exports.derive()
+               const outPtr = exports.getOutputPointer()
+               const pk = new Uint8Array(32)
+               buffer = new DataView(memory.buffer)
+               for (let i = 0; i < 32; i++) {
+                       pk[i] = buffer.getUint8(outPtr + i)
+               }
+               buffer = undefined
+               return pk
+       }
+
+       signSync = function (h: Uint8Array, k: Uint8Array): Uint8Array<ArrayBuffer> {
+               // assembly/nano-nacl/sign() => void
+               let buffer: DataView | undefined = new DataView(memory.buffer)
+               let inPtr = exports.getInputPointer()
+               for (let i = 0; i < 32; i++) {
+                       buffer.setUint8(inPtr + i, h[i])
+               }
+               inPtr += 32
+               for (let i = 0; i < 64; i++) {
+                       buffer.setUint8(inPtr + i, k[i])
+               }
+               exports.sign()
+               const outPtr = exports.getOutputPointer()
+               const s = new Uint8Array(64)
+               buffer = new DataView(memory.buffer)
+               for (let i = 0; i < 64; i++) {
+                       s[i] = buffer.getUint8(outPtr + i)
+               }
+               buffer = undefined
+               return s
+       }
+
+       verifySync = function (s: Uint8Array, h: Uint8Array, k: Uint8Array): Uint8Array<ArrayBuffer> {
+               // assembly/nano-nacl/verify() => void
+               let buffer: DataView | undefined = new DataView(memory.buffer)
+               let inPtr = exports.getInputPointer()
+               for (let i = 0; i < 64; i++) {
+                       buffer.setUint8(inPtr + i, s[i])
+               }
+               inPtr += 64
+               for (let i = 0; i < 32; i++) {
+                       buffer.setUint8(inPtr + i, h[i])
+               }
+               inPtr += 32
+               for (let i = 0; i < 32; i++) {
+                       buffer.setUint8(inPtr + i, k[i])
+               }
+               exports.verify()
+               const outPtr = exports.getOutputPointer()
+               const v = new Uint8Array(1)
+               buffer = new DataView(memory.buffer)
+               v[0] = buffer.getUint8(outPtr)
+               buffer = undefined
+               return v
+       }
+       isReady = true
+} catch (err) {
+       throw new Error('Error instantiating WebAssembly', { cause: err })
+}
+export { deriveSync, signSync, verifySync }
+
  /**
  * Nano public key derivation using WebAssembly.
  */
author	Chris Duncan <chris@codecow.com>
	Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)
committer	Chris Duncan <chris@codecow.com>
	Mon, 16 Mar 2026 20:57:59 +0000 (13:57 -0700)
asconfig.json		patch \| blob \| history
assembly/base.ts		patch \| blob \| history
assembly/blake2b.ts		patch \| blob \| history
assembly/constants.ts		patch \| blob \| history
assembly/fe.ts		patch \| blob \| history
assembly/ge.ts		patch \| blob \| history
assembly/nano-nacl.ts		patch \| blob \| history
assembly/p.ts	[new file with mode: 0644]	patch \| blob
index.html		patch \| blob \| history
index.ts		patch \| blob \| history