From 70b9b5aa70be0d84ba0826bd358ac235df932ee4 Mon Sep 17 00:00:00 2001
From: Chris Duncan <chris@codecow.com>
Date: Mon, 16 Mar 2026 13:57:59 -0700
Subject: [PATCH] Refactor field element multiplication hot paths.

---
 asconfig.json         |    2 +-
 assembly/base.ts      |    2 +-
 assembly/blake2b.ts   |   12 +-
 assembly/constants.ts |   35 +-
 assembly/fe.ts        | 1504 +++++++++++++++++++++++++++++------------
 assembly/ge.ts        |  579 +++-------------
 assembly/nano-nacl.ts |   39 +-
 assembly/p.ts         |  236 +++++++
 index.html            |   47 +-
 index.ts              |  125 +++-
 10 files changed, 1591 insertions(+), 990 deletions(-)
 create mode 100644 assembly/p.ts
diff --git a/asconfig.json b/asconfig.json
index 11c4e61..2828ae2 100644
--- a/asconfig.json
+++ b/asconfig.json
@@ -9,7 +9,7 @@
 		"uncheckedBehavior": "always",
 		"bindings": "esm",
 		"sourceMap": false,
-		"debug": false,
+		"debug": true,
 		"runtime": "stub",
 		"exportRuntime": false,
 		"enable": [
diff --git a/assembly/base.ts b/assembly/base.ts
index c9a7293..d59e79a 100644
--- a/assembly/base.ts
+++ b/assembly/base.ts
@@ -2,7 +2,7 @@
 //! SPDX-License-Identifier: ISC
 
 import { fe } from './fe'
-import { ge_precomp } from './ge'
+import { ge_precomp } from './p'
 
 export const base = StaticArray.fromArray<StaticArray<ge_precomp>>([
 	StaticArray.fromArray<ge_precomp>([ /* 0/31 */
diff --git a/assembly/blake2b.ts b/assembly/blake2b.ts
index c61f76f..82c654b 100644
--- a/assembly/blake2b.ts
+++ b/assembly/blake2b.ts
@@ -75,8 +75,9 @@ export class Blake2b {
 		this.p[3] = 1 // depth
 
 		// initialize hash state
+		memory.copy(changetype<usize>(this.h), changetype<usize>(blake2b_iv), 64)
 		for (let i = 0; i < 8; i++) {
-			this.h[i] = blake2b_iv[i] ^ load<u64>(changetype<usize>(this.p) + (i << 3))
+			this.h[i] ^= load<u64>(changetype<usize>(this.p) + (i << 3))
 		}
 		return this
 	}
@@ -120,13 +121,12 @@ export class Blake2b {
 	}
 
 	// Defined in BLAKE2 section 2.4
+	@inline
 	COMPRESS (isFinal: bool): void {
 
 		// initialize state vector
-		for (let i = 0; i < 8; i++) {
-			this.v[i] = this.h[i]
-			this.v[i + 8] = blake2b_iv[i]
-		}
+		memory.copy(changetype<usize>(this.v), changetype<usize>(this.h), 64)
+		memory.copy(changetype<usize>(this.v) + 64, changetype<usize>(blake2b_iv), 64)
 
 		// lo 64 bits of counter
 		this.v[12] ^= v128.extract_lane<u64>(this.t, 0)
@@ -151,6 +151,7 @@ export class Blake2b {
 		}
 	}
 
+	@inline
 	ROUND (r: u8): void {
 		for (let i = 0; i < 8; i++) {
 			const a = blake2b_state[i][0]
@@ -162,6 +163,7 @@ export class Blake2b {
 		}
 	}
 
+	@inline
 	G (a: u8, b: u8, c: u8, d: u8, x: u8, y: u8): void {
 		this.v[a] = this.v[a] + this.v[b] + this.m[x]
 		this.v[d] = rotr<u64>(this.v[d] ^ this.v[a], 32)
diff --git a/assembly/constants.ts b/assembly/constants.ts
index 426adc7..3b41cd9 100644
--- a/assembly/constants.ts
+++ b/assembly/constants.ts
@@ -6,45 +6,12 @@ export const fe25519_sqrtm1 = StaticArray.fromArray<i32>([
 	-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
 ])
 
-/* sqrt(-486664) */
-export const ed25519_sqrtam2 = StaticArray.fromArray<i32>([
-	-12222970, -8312128, -11511410, 9067497, -15300785, -241793, 25456130, 14121551, -12187136, 3972024
-])
-
 /* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
 export const ed25519_d = StaticArray.fromArray<i32>([
 	-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
 ])
 
-/* 2 * d =
- * 16295367250680780974490674513165176452449235426866156013048779062215315747161
- */
+/* 16295367250680780974490674513165176452449235426866156013048779062215315747161 */
 export const ed25519_d2 = StaticArray.fromArray<i32>([
 	-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199
 ])
-
-/* A = 486662 */
-export const ed25519_A_32 = 486662
-export const ed25519_A = StaticArray.fromArray<i32>([
-	ed25519_A_32, 0, 0, 0, 0, 0, 0, 0, 0, 0
-])
-
-/* sqrt(ad - 1) with a = -1 (mod p) */
-export const ed25519_sqrtadm1 = StaticArray.fromArray<i32>([
-	24849947, -153582, -23613485, 6347715, -21072328, -667138, -25271143, -15367704, -870347, 14525639
-])
-
-/* 1 / sqrt(a - d) */
-export const ed25519_invsqrtamd = StaticArray.fromArray<i32>([
-	6111485, 4156064, -27798727, 12243468, -25904040, 120897, 20826367, -7060776, 6093568, -1986012
-])
-
-/* 1 - d ^ 2 */
-export const ed25519_onemsqd = StaticArray.fromArray<i32>([
-	6275446, -16617371, -22938544, -3773710, 11667077, 7397348, -27922721, 1766195, -24433858, 672203
-])
-
-/* (d - 1) ^ 2 */
-export const ed25519_sqdmone = StaticArray.fromArray<i32>([
-	15551795, -11097455, -13425098, -10125071, -11896535, 10178284, -26634327, 4729244, -5282110, -10116402
-])
diff --git a/assembly/fe.ts b/assembly/fe.ts
index f2170b0..e12d4b4 100644
--- a/assembly/fe.ts
+++ b/assembly/fe.ts
@@ -20,8 +20,10 @@ import { load_3, load_4, sodium_is_zero } from "./utils"
  * 10x 25- and 26-bit array of values representing a large integer `n mod p`.
  */
 export type FieldElement = StaticArray<i32>
-export function fe (source: Array<i32> = new Array<i32>(10)): FieldElement {
-	return StaticArray.fromArray<i32>(source)
+export function fe (src: Array<i32> = new Array<i32>(12)): FieldElement {
+	const dst = new StaticArray<i32>(12)
+	memory.copy(changetype<usize>(dst), src.dataStart, src.length << 2)
+	return dst
 }
 
 /**
@@ -32,7 +34,7 @@ export function fe (source: Array<i32> = new Array<i32>(10)): FieldElement {
 //@ts-expect-error
 @inline
 export function fe_0 (h: FieldElement): void {
-	memory.fill(changetype<usize>(h), 0, 40)
+	memory.fill(changetype<usize>(h), 0, 48)
 }
 
 /**
@@ -44,7 +46,7 @@ export function fe_0 (h: FieldElement): void {
 @inline
 export function fe_1 (h: FieldElement): void {
 	fe_0(h)
-	store<i32>(changetype<usize>(h), 1)
+	store<i32>(changetype<usize>(h), 1, 0)
 }
 
 /**
@@ -57,21 +59,12 @@ export function fe_1 (h: FieldElement): void {
 //@ts-expect-error
 @inline
 export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void {
-	const o = changetype<usize>(h)
-	const a = changetype<usize>(f)
-	const b = changetype<usize>(g)
-
-	let ai = v128.load(a)
-	let bi = v128.load(b)
-	v128.store(o, v128.add<i32>(ai, bi))
-
-	ai = v128.load(a, 16)
-	bi = v128.load(b, 16)
-	v128.store(o, v128.add<i32>(ai, bi), 16)
-
-	ai = i64x2(load<i64>(a + 32), 0)
-	bi = i64x2(load<i64>(b + 32), 0)
-	v128.store_lane<i64>(o, v128.add<i32>(ai, bi), 0, 32)
+	const h_ptr = changetype<usize>(h)
+	const f_ptr = changetype<usize>(f)
+	const g_ptr = changetype<usize>(g)
+	v128.store(h_ptr, v128.add<i32>(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0)
+	v128.store(h_ptr, v128.add<i32>(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16)
+	v128.store_lane<i64>(h_ptr, v128.add<i32>(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32)
 }
 
 /**
@@ -84,22 +77,12 @@ export function fe_add (h: FieldElement, f: FieldElement, g: FieldElement): void
 //@ts-expect-error
 @inline
 export function fe_cmov (f: FieldElement, g: FieldElement, b: u64): void {
-	const p = changetype<usize>(f)
-	const q = changetype<usize>(g)
-
+	const f_ptr = changetype<usize>(f)
+	const g_ptr = changetype<usize>(g)
 	const c = v128.splat<u64>(0 - b)
-
-	let pi = v128.load(p)
-	let qi = v128.load(q)
-	v128.store(p, v128.bitselect(qi, pi, c))
-
-	pi = v128.load(p, 16)
-	qi = v128.load(q, 16)
-	v128.store(p, v128.bitselect(qi, pi, c), 16)
-
-	pi = i64x2(load<i64>(p + 32), 0)
-	qi = i64x2(load<i64>(q + 32), 0)
-	v128.store_lane<u64>(p, v128.bitselect(qi, pi, c), 0, 32)
+	v128.store(f_ptr, v128.bitselect(v128.load(g_ptr), v128.load(f_ptr, 0), c))
+	v128.store(f_ptr, v128.bitselect(v128.load(g_ptr, 16), v128.load(f_ptr, 16), c), 16)
+	v128.store_lane<u64>(f_ptr, v128.bitselect(v128.load(g_ptr, 32), v128.load(f_ptr, 32), c), 0, 32)
 }
 
 /**
@@ -124,25 +107,23 @@ export function fe_copy (h: FieldElement, f: FieldElement): void {
 //@ts-expect-error
 @inline
 export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void {
-	const p = changetype<usize>(f)
-	const q = changetype<usize>(g)
-
+	const f_ptr = changetype<usize>(f)
+	const g_ptr = changetype<usize>(g)
 	const c = v128.splat<u64>(0 - b)
 
-	let pi = v128.load(p)
-	let qi = v128.load(q)
-	v128.store(p, v128.bitselect(qi, pi, c))
-	v128.store(q, v128.bitselect(pi, qi, c))
-
-	pi = v128.load(p, 16)
-	qi = v128.load(q, 16)
-	v128.store(p, v128.bitselect(qi, pi, c), 16)
-	v128.store(q, v128.bitselect(pi, qi, c), 16)
-
-	pi = i64x2(load<i64>(p + 32), 0)
-	qi = i64x2(load<i64>(q + 32), 0)
-	v128.store_lane<i64>(p, v128.bitselect(qi, pi, c), 0, 32)
-	v128.store_lane<i64>(q, v128.bitselect(pi, qi, c), 0, 32)
+	const f03 = v128.load(f_ptr)
+	const g03 = v128.load(g_ptr)
+	const f47 = v128.load(f_ptr, 16)
+	const g47 = v128.load(g_ptr, 16)
+	const f89 = v128.load(f_ptr, 32)
+	const g89 = v128.load(g_ptr, 32)
+
+	v128.store(f_ptr, v128.bitselect(g03, f03, c))
+	v128.store(g_ptr, v128.bitselect(f03, g03, c))
+	v128.store(f_ptr, v128.bitselect(g47, f47, c), 16)
+	v128.store(g_ptr, v128.bitselect(f47, g47, c), 16)
+	v128.store_lane<i64>(f_ptr, v128.bitselect(g89, f89, c), 0, 32)
+	v128.store_lane<i64>(g_ptr, v128.bitselect(f89, g89, c), 0, 32)
 }
 
 /**
@@ -154,17 +135,11 @@ export function fe_cswap (f: FieldElement, g: FieldElement, b: u64): void {
 //@ts-expect-error
 @inline
 export function fe_dbl (h: FieldElement, f: FieldElement): void {
-	const o = changetype<usize>(h)
-	const a = changetype<usize>(f)
-
-	let ai = v128.load(a)
-	v128.store(o, v128.add<i32>(ai, ai))
-
-	ai = v128.load(a, 16)
-	v128.store(o, v128.add<i32>(ai, ai), 16)
-
-	ai = i64x2(load<i64>(a + 32), 0)
-	v128.store_lane<i64>(o, v128.add<i32>(ai, ai), 0, 32)
+	const h_ptr = changetype<usize>(h)
+	const f_ptr = changetype<usize>(f)
+	v128.store(h_ptr, v128.shl<i32>(v128.load(f_ptr, 0), 1), 0)
+	v128.store(h_ptr, v128.shl<i32>(v128.load(f_ptr, 16), 1), 16)
+	v128.store_lane<i64>(h_ptr, v128.shl<i32>(v128.load(f_ptr, 32), 1), 0, 32)
 }
 
 /**
@@ -195,46 +170,47 @@ export function fe_frombytes (h: FieldElement, s: StaticArray<u8>): void {
 
 	carry9 = (h9 + i64(1 << 24)) >> 25
 	h0 += carry9 * 19
-	h9 -= carry9 * u64(1 << 25)
+	h9 -= carry9 << 25
 	carry1 = (h1 + i64(1 << 24)) >> 25
 	h2 += carry1
-	h1 -= carry1 * u64(1 << 25)
+	h1 -= carry1 << 25
 	carry3 = (h3 + i64(1 << 24)) >> 25
 	h4 += carry3
-	h3 -= carry3 * u64(1 << 25)
+	h3 -= carry3 << 25
 	carry5 = (h5 + i64(1 << 24)) >> 25
 	h6 += carry5
-	h5 -= carry5 * u64(1 << 25)
+	h5 -= carry5 << 25
 	carry7 = (h7 + i64(1 << 24)) >> 25
 	h8 += carry7
-	h7 -= carry7 * u64(1 << 25)
+	h7 -= carry7 << 25
 
 	carry0 = (h0 + i64(1 << 25)) >> 26
 	h1 += carry0
-	h0 -= carry0 * u64(1 << 26)
+	h0 -= carry0 << 26
 	carry2 = (h2 + i64(1 << 25)) >> 26
 	h3 += carry2
-	h2 -= carry2 * u64(1 << 26)
+	h2 -= carry2 << 26
 	carry4 = (h4 + i64(1 << 25)) >> 26
 	h5 += carry4
-	h4 -= carry4 * u64(1 << 26)
+	h4 -= carry4 << 26
 	carry6 = (h6 + i64(1 << 25)) >> 26
 	h7 += carry6
-	h6 -= carry6 * u64(1 << 26)
+	h6 -= carry6 << 26
 	carry8 = (h8 + i64(1 << 25)) >> 26
 	h9 += carry8
-	h8 -= carry8 * u64(1 << 26)
-
-	h[0] = i32(h0)
-	h[1] = i32(h1)
-	h[2] = i32(h2)
-	h[3] = i32(h3)
-	h[4] = i32(h4)
-	h[5] = i32(h5)
-	h[6] = i32(h6)
-	h[7] = i32(h7)
-	h[8] = i32(h8)
-	h[9] = i32(h9)
+	h8 -= carry8 << 26
+
+	const h_ptr: usize = changetype<usize>(h)
+	store<i32>(h_ptr, h0, 0)
+	store<i32>(h_ptr, h1, 4)
+	store<i32>(h_ptr, h2, 8)
+	store<i32>(h_ptr, h3, 12)
+	store<i32>(h_ptr, h4, 16)
+	store<i32>(h_ptr, h5, 20)
+	store<i32>(h_ptr, h6, 24)
+	store<i32>(h_ptr, h7, 28)
+	store<i32>(h_ptr, h8, 32)
+	store<i32>(h_ptr, h9, 36)
 }
 
 const t0: FieldElement = fe()
@@ -291,7 +267,7 @@ export function fe_invert (out: FieldElement, z: FieldElement): void {
 	fe_mul(out, t1, t0)
 }
 
-const fe_isnegative_s = new StaticArray<u8>(32)
+const fe_isnegative_t = fe()
 /**
  * return 1 if f is in {1,3,5,...,q-2}
  * return 0 if f is in {0,2,4,...,q-1}
@@ -302,9 +278,9 @@ const fe_isnegative_s = new StaticArray<u8>(32)
 //@ts-expect-error
 @inline
 export function fe_isnegative (f: FieldElement): u8 {
-	const s = fe_isnegative_s
-	fe_tobytes(s, f)
-	return s[0] & 1
+	const t = fe_isnegative_t
+	fe_reduce(t, f)
+	return load<u8>(changetype<usize>(t), 0) & 1
 }
 
 const fe_iszero_s = new StaticArray<u8>(32)
@@ -323,7 +299,6 @@ export function fe_iszero (f: FieldElement): u8 {
 	return sodium_is_zero(s, 32)
 }
 
-const multiply_t = new StaticArray<i64>(24)
 /**
  * Multiply two FieldElements and store the product.
  *
@@ -332,60 +307,256 @@ const multiply_t = new StaticArray<i64>(24)
  * @param g FieldElement multiplicand source
  */
 export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void {
-	const o = changetype<usize>(h)
-	const a = changetype<usize>(f)
-	const b = changetype<usize>(g)
-
-	const t = changetype<usize>(multiply_t)
-	memory.fill(t, 0, 160)
-
-	const b0: v128 = v128.load(b)
-	const b4: v128 = v128.load(b + 16)
-	const b8: v128 = i32x4(load<i32>(b + 32), load<i32>(b + 36), 0, 0)
-
-	for (let i = 0; i < 10; i++) {
-		const odd = (i & 1) + 1
-		const ai = v128.splat<i32>(load<i32>(a + (i << 2)))
-
-		let ptr = t + (usize(i) << 3)
-		let pLo = i64x2.extmul_low_i32x4_s(ai, b0)
-		let pHi = i64x2.extmul_high_i32x4_s(ai, b0)
-		pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-		pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-		let tLo = v128.load(ptr)
-		let tHi = v128.load(ptr + 16)
-		tLo = i64x2.add(tLo, pLo)
-		tHi = i64x2.add(tHi, pHi)
-		v128.store(ptr, tLo)
-		v128.store(ptr + 16, tHi)
-
-		ptr += 32
-		pLo = i64x2.extmul_low_i32x4_s(ai, b4)
-		pHi = i64x2.extmul_high_i32x4_s(ai, b4)
-		pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-		pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-		tLo = v128.load(ptr)
-		tHi = v128.load(ptr + 16)
-		tLo = i64x2.add(tLo, pLo)
-		tHi = i64x2.add(tHi, pHi)
-		v128.store(ptr, tLo)
-		v128.store(ptr + 16, tHi)
-
-		ptr += 32
-		pLo = i64x2.extmul_low_i32x4_s(ai, b8)
-		pHi = i64x2.extmul_high_i32x4_s(ai, b8)
-		pLo = v128.mul<i64>(pLo, i64x2(1, odd))
-		pHi = v128.mul<i64>(pHi, i64x2(1, odd))
-		tLo = v128.load(ptr)
-		tHi = v128.load(ptr + 16)
-		tLo = i64x2.add(tLo, pLo)
-		tHi = i64x2.add(tHi, pHi)
-		v128.store(ptr, tLo)
-		v128.store(ptr + 16, tHi)
-	}
-	// discard last 4 elements since we do not actually need them
-	memory.fill(t + 160, 0, 32)
-	normalize(o, t)
+	const f_ptr: usize = changetype<usize>(f)
+	const g_ptr: usize = changetype<usize>(g)
+	const g0123: v128 = v128.load(g_ptr, 0)
+	const g4567: v128 = v128.load(g_ptr, 16)
+	const g89xx: v128 = v128.load(g_ptr, 32)
+
+	// mask to select even lane from first vector and odd lane from second
+	const m: v128 = i32x4(-1, 0, -1, 0)
+
+	// mask to double odd-on-odd indexes
+	const odds: v128 = i32x4(0, -1, 0, -1)
+
+	// factor of 19 for wrap from h9 to h0
+	const v19: v128 = i32x4.splat(19)
+	const g0123_19: v128 = i32x4.mul(g0123, v19)
+	const g4567_19: v128 = i32x4.mul(g4567, v19)
+	const g89xx_19: v128 = i32x4.mul(g89xx, v19)
+
+	// f[0]
+	let fi: i32 = load<i32>(f_ptr, 0)
+	let fv: v128 = i32x4.splat(fi)
+
+	let h01: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	let h23: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	let h45: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+	let h67: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+	let h89: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx)
+
+	// f[1]
+	fi = load<i32>(f_ptr, 4)
+	fv = i32x4.splat(fi)
+	fv = i32x4.add(fv, v128.and(fv, odds))
+
+	let h12: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	let h34: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	let h56: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+	let h78: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+	let h90: v128 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g89xx, g89xx_19, m))
+
+	// f[2]
+	fi = load<i32>(f_ptr, 8)
+	fv = i32x4.splat(fi)
+
+	let t0: v128 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	let t1: v128 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	let t2: v128 = i64x2.extmul_low_i32x4_s(fv, g4567)
+	let t3: v128 = i64x2.extmul_high_i32x4_s(fv, g4567)
+	let t4: v128 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h23 = i64x2.add(h23, t0)
+	h45 = i64x2.add(h45, t1)
+	h67 = i64x2.add(h67, t2)
+	h89 = i64x2.add(h89, t3)
+	h01 = i64x2.add(h01, t4)
+
+	// f[3]
+	fi = load<i32>(f_ptr, 12)
+	fv = i32x4.splat(fi)
+	fv = i32x4.add(fv, v128.and(fv, odds))
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567)
+	t3 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m))
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h34 = i64x2.add(h34, t0)
+	h56 = i64x2.add(h56, t1)
+	h78 = i64x2.add(h78, t2)
+	h90 = i64x2.add(h90, t3)
+	h12 = i64x2.add(h12, t4)
+
+	// f[4]
+	fi = load<i32>(f_ptr, 16)
+	fv = i32x4.splat(fi)
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567)
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h45 = i64x2.add(h45, t0)
+	h67 = i64x2.add(h67, t1)
+	h89 = i64x2.add(h89, t2)
+	h01 = i64x2.add(h01, t3)
+	h23 = i64x2.add(h23, t4)
+
+	// f[5]
+	fi = load<i32>(f_ptr, 20)
+	fv = i32x4.splat(fi)
+	fv = i32x4.add(fv, v128.and(fv, odds))
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	t2 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g4567, g4567_19, m))
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h56 = i64x2.add(h56, t0)
+	h78 = i64x2.add(h78, t1)
+	h90 = i64x2.add(h90, t2)
+	h12 = i64x2.add(h12, t3)
+	h34 = i64x2.add(h34, t4)
+
+	// f[6]
+	fi = load<i32>(f_ptr, 24)
+	fv = i32x4.splat(fi)
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123)
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h67 = i64x2.add(h67, t0)
+	h89 = i64x2.add(h89, t1)
+	h01 = i64x2.add(h01, t2)
+	h23 = i64x2.add(h23, t3)
+	h45 = i64x2.add(h45, t4)
+
+	// f[7]
+	fi = load<i32>(f_ptr, 28)
+	fv = i32x4.splat(fi)
+	fv = i32x4.add(fv, v128.and(fv, odds))
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m))
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h78 = i64x2.add(h78, t0)
+	h90 = i64x2.add(h90, t1)
+	h12 = i64x2.add(h12, t2)
+	h34 = i64x2.add(h34, t3)
+	h56 = i64x2.add(h56, t4)
+
+	// f[8]
+	fi = load<i32>(f_ptr, 32)
+	fv = i32x4.splat(fi)
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, g0123)
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19)
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h89 = i64x2.add(h89, t0)
+	h01 = i64x2.add(h01, t1)
+	h23 = i64x2.add(h23, t2)
+	h45 = i64x2.add(h45, t3)
+	h67 = i64x2.add(h67, t4)
+
+	// f[9]
+	fi = load<i32>(f_ptr, 36)
+	fv = i32x4.splat(fi)
+	fv = i32x4.add(fv, v128.and(fv, odds))
+
+	t0 = i64x2.extmul_low_i32x4_s(fv, v128.bitselect(g0123, g0123_19, m))
+	t1 = i64x2.extmul_high_i32x4_s(fv, g0123_19)
+	t2 = i64x2.extmul_low_i32x4_s(fv, g4567_19)
+	t3 = i64x2.extmul_high_i32x4_s(fv, g4567_19)
+	t4 = i64x2.extmul_low_i32x4_s(fv, g89xx_19)
+
+	h90 = i64x2.add(h90, t0)
+	h12 = i64x2.add(h12, t1)
+	h34 = i64x2.add(h34, t2)
+	h56 = i64x2.add(h56, t3)
+	h78 = i64x2.add(h78, t4)
+
+	// extract scalars from vectors
+	let h0: i64 = i64x2.extract_lane(h90, 1) + i64x2.extract_lane(h01, 0)
+	let h1: i64 = i64x2.extract_lane(h01, 1) + i64x2.extract_lane(h12, 0)
+	let h2: i64 = i64x2.extract_lane(h12, 1) + i64x2.extract_lane(h23, 0)
+	let h3: i64 = i64x2.extract_lane(h23, 1) + i64x2.extract_lane(h34, 0)
+	let h4: i64 = i64x2.extract_lane(h34, 1) + i64x2.extract_lane(h45, 0)
+	let h5: i64 = i64x2.extract_lane(h45, 1) + i64x2.extract_lane(h56, 0)
+	let h6: i64 = i64x2.extract_lane(h56, 1) + i64x2.extract_lane(h67, 0)
+	let h7: i64 = i64x2.extract_lane(h67, 1) + i64x2.extract_lane(h78, 0)
+	let h8: i64 = i64x2.extract_lane(h78, 1) + i64x2.extract_lane(h89, 0)
+	let h9: i64 = i64x2.extract_lane(h89, 1) + i64x2.extract_lane(h90, 0)
+
+	// extract scalars from vectors during first carry
+	let carry0: i64
+	let carry1: i64
+	let carry2: i64
+	let carry3: i64
+	let carry4: i64
+	let carry5: i64
+	let carry6: i64
+	let carry7: i64
+	let carry8: i64
+	let carry9: i64
+
+	carry0 = (h0 + (1 << 25)) >> 26
+	h1 += carry0
+	h0 -= carry0 << 26
+	carry4 = (h4 + (1 << 25)) >> 26
+	h5 += carry4
+	h4 -= carry4 << 26
+
+	carry1 = (h1 + (1 << 24)) >> 25
+	h2 += carry1
+	h1 -= carry1 << 25
+	carry5 = (h5 + (1 << 24)) >> 25
+	h6 += carry5
+	h5 -= carry5 << 25
+
+	carry2 = (h2 + (1 << 25)) >> 26
+	h3 += carry2
+	h2 -= carry2 << 26
+	carry6 = (h6 + (1 << 25)) >> 26
+	h7 += carry6
+	h6 -= carry6 << 26
+
+	carry3 = (h3 + (1 << 24)) >> 25
+	h4 += carry3
+	h3 -= carry3 << 25
+	carry7 = (h7 + (1 << 24)) >> 25
+	h8 += carry7
+	h7 -= carry7 << 25
+
+	carry4 = (h4 + (1 << 25)) >> 26
+	h5 += carry4
+	h4 -= carry4 << 26
+	carry8 = (h8 + (1 << 25)) >> 26
+	h9 += carry8
+	h8 -= carry8 << 26
+
+	carry9 = (h9 + (1 << 24)) >> 25
+	h0 += carry9 * 19
+	h9 -= carry9 << 25
+
+	carry0 = (h0 + (1 << 25)) >> 26
+	h1 += carry0
+	h0 -= carry0 << 26
+
+	// assign results to output
+	const h_ptr: usize = changetype<usize>(h)
+	store<i32>(h_ptr, h0, 0)
+	store<i32>(h_ptr, h1, 4)
+	store<i32>(h_ptr, h2, 8)
+	store<i32>(h_ptr, h3, 12)
+	store<i32>(h_ptr, h4, 16)
+	store<i32>(h_ptr, h5, 20)
+	store<i32>(h_ptr, h6, 24)
+	store<i32>(h_ptr, h7, 28)
+	store<i32>(h_ptr, h8, 32)
+	store<i32>(h_ptr, h9, 36)
 }
 
 /**
@@ -397,17 +568,11 @@ export function fe_mul (h: FieldElement, f: FieldElement, g: FieldElement): void
 //@ts-expect-error
 @inline
 export function fe_neg (h: FieldElement, f: FieldElement): void {
-	const o = changetype<usize>(h)
-	const a = changetype<usize>(f)
-
-	let ai = v128.load(a)
-	v128.store(o, v128.neg<i32>(ai))
-
-	ai = v128.load(a, 16)
-	v128.store(o, v128.neg<i32>(ai), 16)
-
-	ai = i64x2(load<i64>(a + 32), 0)
-	v128.store_lane<i64>(o, v128.neg<i32>(ai), 0, 32)
+	const h_ptr = changetype<usize>(h)
+	const f_ptr = changetype<usize>(f)
+	v128.store(h_ptr, v128.neg<i32>(v128.load(f_ptr)))
+	v128.store(h_ptr, v128.neg<i32>(v128.load(f_ptr, 16)), 16)
+	v128.store_lane<i64>(h_ptr, v128.neg<i32>(v128.load(f_ptr, 32)), 0, 32)
 }
 
 const fe_pow22523_t0: FieldElement = fe()
@@ -492,100 +657,752 @@ export function fe_pow22523 (out: FieldElement, z: FieldElement): void {
  * so floor(2â»Â²âµâµ(h + 19 2^(-25) h9 + 2^(-1))) = q.
  */
 export function fe_reduce (h: FieldElement, f: FieldElement): void {
-	let h0 = f[0]
-	let h1 = f[1]
-	let h2 = f[2]
-	let h3 = f[3]
-	let h4 = f[4]
-	let h5 = f[5]
-	let h6 = f[6]
-	let h7 = f[7]
-	let h8 = f[8]
-	let h9 = f[9]
+	const f_ptr: usize = changetype<usize>(f)
+	let f0: i32 = load<i32>(f_ptr, 0)
+	let f1: i32 = load<i32>(f_ptr, 4)
+	let f2: i32 = load<i32>(f_ptr, 8)
+	let f3: i32 = load<i32>(f_ptr, 12)
+	let f4: i32 = load<i32>(f_ptr, 16)
+	let f5: i32 = load<i32>(f_ptr, 20)
+	let f6: i32 = load<i32>(f_ptr, 24)
+	let f7: i32 = load<i32>(f_ptr, 28)
+	let f8: i32 = load<i32>(f_ptr, 32)
+	let f9: i32 = load<i32>(f_ptr, 36)
 	let q: i32
-	let carry0: i32, carry1: i32, carry2: i32, carry3: i32, carry4: i32, carry5: i32, carry6: i32, carry7: i32, carry8: i32, carry9: i32
-
-	q = (19 * h9 + (u32(1) << 24)) >> 25
-	q = (h0 + q) >> 26
-	q = (h1 + q) >> 25
-	q = (h2 + q) >> 26
-	q = (h3 + q) >> 25
-	q = (h4 + q) >> 26
-	q = (h5 + q) >> 25
-	q = (h6 + q) >> 26
-	q = (h7 + q) >> 25
-	q = (h8 + q) >> 26
-	q = (h9 + q) >> 25
+	let c0: i32, c1: i32, c2: i32, c3: i32, c4: i32, c5: i32, c6: i32, c7: i32, c8: i32, c9: i32
+
+	q = (19 * f9 + (1 << 24)) >> 25
+	q = (f0 + q) >> 26
+	q = (f1 + q) >> 25
+	q = (f2 + q) >> 26
+	q = (f3 + q) >> 25
+	q = (f4 + q) >> 26
+	q = (f5 + q) >> 25
+	q = (f6 + q) >> 26
+	q = (f7 + q) >> 25
+	q = (f8 + q) >> 26
+	q = (f9 + q) >> 25
 
 	/* Goal: Output h-(2Â²âµâµ-19)q, which is between 0 and 2Â²âµâµ-20. */
-	h0 += 19 * q
+	f0 += 19 * q
 	/* Goal: Output h-2Â²âµâµ q, which is between 0 and 2Â²âµâµ-20. */
 
-	carry0 = h0 >> 26
+	c0 = f0 >> 26
+	f1 += c0
+	f0 -= c0 << 26
+	c1 = f1 >> 25
+	f2 += c1
+	f1 -= c1 << 25
+	c2 = f2 >> 26
+	f3 += c2
+	f2 -= c2 << 26
+	c3 = f3 >> 25
+	f4 += c3
+	f3 -= c3 << 25
+	c4 = f4 >> 26
+	f5 += c4
+	f4 -= c4 << 26
+	c5 = f5 >> 25
+	f6 += c5
+	f5 -= c5 << 25
+	c6 = f6 >> 26
+	f7 += c6
+	f6 -= c6 << 26
+	c7 = f7 >> 25
+	f8 += c7
+	f7 -= c7 << 25
+	c8 = f8 >> 26
+	f9 += c8
+	f8 -= c8 << 26
+	c9 = f9 >> 25
+	f9 -= c9 << 25
+
+	const h_ptr: usize = changetype<usize>(h)
+	store<i32>(h_ptr, f0, 0)
+	store<i32>(h_ptr, f1, 4)
+	store<i32>(h_ptr, f2, 8)
+	store<i32>(h_ptr, f3, 12)
+	store<i32>(h_ptr, f4, 16)
+	store<i32>(h_ptr, f5, 20)
+	store<i32>(h_ptr, f6, 24)
+	store<i32>(h_ptr, f7, 28)
+	store<i32>(h_ptr, f8, 32)
+	store<i32>(h_ptr, f9, 36)
+}
+
+/**
+ * Square a FieldElement and store the result.
+ *
+ * Some iterations of the inner loop can be skipped since
+ * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]`
+ *
+ * ```
+ * // non-constant-time example
+ * for (let i = 0; i < 10; i++) {
+ * 	for (let j = i; j < 10; j++) {
+ * 		h[i + j] += f[i] * g[j]
+ * 		if (i < j) {
+ * 			h[i + j] *= 2
+ * 		}
+ * 	}
+ * }
+ * ```
+ *
+ * h = f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
+ * @param h FieldElement power destination
+ * @param f FieldElement base source
+ */
+export function fe_sq (h: FieldElement, f: FieldElement): void {
+	const f_ptr: usize = changetype<usize>(f)
+	const f0: i64 = i64(load<i32>(f_ptr, 0))
+	const f1: i64 = i64(load<i32>(f_ptr, 4))
+	const f2: i64 = i64(load<i32>(f_ptr, 8))
+	const f3: i64 = i64(load<i32>(f_ptr, 12))
+	const f4: i64 = i64(load<i32>(f_ptr, 16))
+	const f5: i64 = i64(load<i32>(f_ptr, 20))
+	const f6: i64 = i64(load<i32>(f_ptr, 24))
+	const f7: i64 = i64(load<i32>(f_ptr, 28))
+	const f8: i64 = i64(load<i32>(f_ptr, 32))
+	const f9: i64 = i64(load<i32>(f_ptr, 36))
+
+	const f0_2: i64 = f0 * 2
+	const f1_2: i64 = f1 * 2
+	const f2_2: i64 = f2 * 2
+	const f3_2: i64 = f3 * 2
+	const f4_2: i64 = f4 * 2
+	const f5_2: i64 = f5 * 2
+	const f6_2: i64 = f6 * 2
+	const f7_2: i64 = f7 * 2
+
+	const f5_19: i64 = f5 * 19 /* 1.959375*2^29 */
+	const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */
+	const f7_19: i64 = f7 * 19 /* 1.959375*2^29 */
+	const f8_19: i64 = f8 * 19 /* 1.959375*2^30 */
+	const f9_19: i64 = f9 * 19 /* 1.959375*2^29 */
+
+	const f7_38: i64 = f7 * 38 /* 1.959375*2^30 */
+	const f9_38: i64 = f9 * 38 /* 1.959375*2^30 */
+
+	let h0: i64 = f0 * f0
+	let h1: i64 = f0_2 * f1
+	let h2: i64 = f0_2 * f2
+	let h3: i64 = f0_2 * f3
+	let h4: i64 = f0_2 * f4
+	let h5: i64 = f0_2 * f5
+	let h6: i64 = f0_2 * f6
+	let h7: i64 = f0_2 * f7
+	let h8: i64 = f0_2 * f8
+	let h9: i64 = f0_2 * f9
+
+	h2 += f1_2 * f1
+	h3 += f1_2 * f2
+	h4 += f1_2 * f3_2
+	h5 += f1_2 * f4
+	h6 += f1_2 * f5_2
+	h7 += f1_2 * f6
+	h8 += f1_2 * f7_2
+	h9 += f1_2 * f8
+	h0 += f1_2 * f9_38
+
+	h4 += f2 * f2
+	h5 += f2_2 * f3
+	h6 += f2_2 * f4
+	h7 += f2_2 * f5
+	h8 += f2_2 * f6
+	h9 += f2_2 * f7
+	h0 += f2_2 * f8_19
+	h1 += f2_2 * f9_19
+
+	h6 += f3_2 * f3
+	h7 += f3_2 * f4
+	h8 += f3_2 * f5_2
+	h9 += f3_2 * f6
+	h0 += f3_2 * f7_38
+	h1 += f3_2 * f8_19
+	h2 += f3_2 * f9_38
+
+	h8 += f4 * f4
+	h9 += f4_2 * f5
+	h0 += f4_2 * f6_19
+	h1 += f4_2 * f7_19
+	h2 += f4_2 * f8_19
+	h3 += f4_2 * f9_19
+
+	h0 += f5_2 * f5_19
+	h1 += f5_2 * f6_19
+	h2 += f5_2 * f7_38
+	h3 += f5_2 * f8_19
+	h4 += f5_2 * f9_38
+
+	h2 += f6 * f6_19
+	h3 += f6_2 * f7_19
+	h4 += f6_2 * f8_19
+	h5 += f6_2 * f9_19
+
+	h4 += f7_2 * f7_19
+	h5 += f7_2 * f8_19
+	h6 += f7_2 * f9_38
+
+	h6 += f8 * f8_19
+	h7 += f8 * f9_38
+
+	h8 += f9 * f9_38
+
+	let carry0: i64
+	let carry1: i64
+	let carry2: i64
+	let carry3: i64
+	let carry4: i64
+	let carry5: i64
+	let carry6: i64
+	let carry7: i64
+	let carry8: i64
+	let carry9: i64
+
+	carry0 = (h0 + i64(1 << 25)) >> 26
 	h1 += carry0
-	h0 -= carry0 * u32(1 << 26)
-	carry1 = h1 >> 25
-	h2 += carry1
-	h1 -= carry1 * u32(1 << 25)
-	carry2 = h2 >> 26
-	h3 += carry2
-	h2 -= carry2 * u32(1 << 26)
-	carry3 = h3 >> 25
-	h4 += carry3
-	h3 -= carry3 * u32(1 << 25)
-	carry4 = h4 >> 26
+	h0 -= carry0 << 26
+	carry4 = (h4 + i64(1 << 25)) >> 26
 	h5 += carry4
-	h4 -= carry4 * u32(1 << 26)
-	carry5 = h5 >> 25
+	h4 -= carry4 << 26
+
+	carry1 = (h1 + i64(1 << 24)) >> 25
+	h2 += carry1
+	h1 -= carry1 << 25
+	carry5 = (h5 + i64(1 << 24)) >> 25
 	h6 += carry5
-	h5 -= carry5 * u32(1 << 25)
-	carry6 = h6 >> 26
+	h5 -= carry5 << 25
+
+	carry2 = (h2 + i64(1 << 25)) >> 26
+	h3 += carry2
+	h2 -= carry2 << 26
+	carry6 = (h6 + i64(1 << 25)) >> 26
 	h7 += carry6
-	h6 -= carry6 * u32(1 << 26)
-	carry7 = h7 >> 25
+	h6 -= carry6 << 26
+
+	carry3 = (h3 + i64(1 << 24)) >> 25
+	h4 += carry3
+	h3 -= carry3 << 25
+	carry7 = (h7 + i64(1 << 24)) >> 25
 	h8 += carry7
-	h7 -= carry7 * u32(1 << 25)
-	carry8 = h8 >> 26
+	h7 -= carry7 << 25
+
+	carry4 = (h4 + i64(1 << 25)) >> 26
+	h5 += carry4
+	h4 -= carry4 << 26
+	carry8 = (h8 + i64(1 << 25)) >> 26
 	h9 += carry8
-	h8 -= carry8 * u32(1 << 26)
-	carry9 = h9 >> 25
-	h9 -= carry9 * u32(1 << 25)
-
-	h[0] = h0
-	h[1] = h1
-	h[2] = h2
-	h[3] = h3
-	h[4] = h4
-	h[5] = h5
-	h[6] = h6
-	h[7] = h7
-	h[8] = h8
-	h[9] = h9
+	h8 -= carry8 << 26
+
+	carry9 = (h9 + i64(1 << 24)) >> 25
+	h0 += carry9 * 19
+	h9 -= carry9 << 25
+
+	carry0 = (h0 + i64(1 << 25)) >> 26
+	h1 += carry0
+	h0 -= carry0 << 26
+
+	const h_ptr: usize = changetype<usize>(h)
+	store<i32>(h_ptr, h0, 0)
+	store<i32>(h_ptr, h1, 4)
+	store<i32>(h_ptr, h2, 8)
+	store<i32>(h_ptr, h3, 12)
+	store<i32>(h_ptr, h4, 16)
+	store<i32>(h_ptr, h5, 20)
+	store<i32>(h_ptr, h6, 24)
+	store<i32>(h_ptr, h7, 28)
+	store<i32>(h_ptr, h8, 32)
+	store<i32>(h_ptr, h9, 36)
 }
 
 /**
  * Square a FieldElement and store the result.
  *
- * @param h FieldElement power destination
- * @param f FieldElement base source
+ * Some iterations of the inner loop can be skipped since
+ * `f[x]*f[y] + f[y]*f[x] = 2*f[x]*f[y]`
+ *
+ * ```
+ * // non-constant-time example
+ * for (let i = 0; i < 10; i++) {
+ * 	for (let j = i; j < 10; j++) {
+ * 		h[i + j] += f[i] * g[j]
+ * 		if (i < j) {
+ * 			h[i + j] *= 2
+ * 		}
+ * 	}
+ * }
+ * ```
+ *
+ * h = f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
+ * @param hx FieldElement power destination
+ * @param fx FieldElement base source
  */
-//@ts-expect-error
-@inline
-export function fe_sq (h: FieldElement, f: FieldElement): void {
-	fe_mul(h, f, f)
+export function fe_sq_vec (hx: FieldElement, fx: FieldElement, hy: FieldElement, fy: FieldElement): void {
+	const fx_ptr: usize = changetype<usize>(fx)
+	const fy_ptr: usize = changetype<usize>(fy)
+	const f0: v128 = i32x4(load<i32>(fx_ptr, 0), load<i32>(fy_ptr, 0), 0, 0)
+	const f1: v128 = i32x4(load<i32>(fx_ptr, 4), load<i32>(fy_ptr, 4), 0, 0)
+	const f2: v128 = i32x4(load<i32>(fx_ptr, 8), load<i32>(fy_ptr, 8), 0, 0)
+	const f3: v128 = i32x4(load<i32>(fx_ptr, 12), load<i32>(fy_ptr, 12), 0, 0)
+	const f4: v128 = i32x4(load<i32>(fx_ptr, 16), load<i32>(fy_ptr, 16), 0, 0)
+	const f5: v128 = i32x4(load<i32>(fx_ptr, 20), load<i32>(fy_ptr, 20), 0, 0)
+	const f6: v128 = i32x4(load<i32>(fx_ptr, 24), load<i32>(fy_ptr, 24), 0, 0)
+	const f7: v128 = i32x4(load<i32>(fx_ptr, 28), load<i32>(fy_ptr, 28), 0, 0)
+	const f8: v128 = i32x4(load<i32>(fx_ptr, 32), load<i32>(fy_ptr, 32), 0, 0)
+	const f9: v128 = i32x4(load<i32>(fx_ptr, 36), load<i32>(fy_ptr, 36), 0, 0)
+
+	const v19: v128 = i32x4.splat(19)
+	const f5_19: v128 = i32x4.mul(f5, v19) /* 1.959375*2^29 */
+	const f6_19: v128 = i32x4.mul(f6, v19) /* 1.959375*2^30 */
+	const f7_19: v128 = i32x4.mul(f7, v19) /* 1.959375*2^29 */
+	const f8_19: v128 = i32x4.mul(f8, v19) /* 1.959375*2^30 */
+	const f9_19: v128 = i32x4.mul(f9, v19) /* 1.959375*2^29 */
+
+	// f[0]
+	let f_2: v128 = i32x4.shl(f0, 1)
+
+	let h0: v128 = i64x2.extmul_low_i32x4_s(f0, f0)
+	let h1: v128 = i64x2.extmul_low_i32x4_s(f_2, f1)
+	let h2: v128 = i64x2.extmul_low_i32x4_s(f_2, f2)
+	let h3: v128 = i64x2.extmul_low_i32x4_s(f_2, f3)
+	let h4: v128 = i64x2.extmul_low_i32x4_s(f_2, f4)
+	let h5: v128 = i64x2.extmul_low_i32x4_s(f_2, f5)
+	let h6: v128 = i64x2.extmul_low_i32x4_s(f_2, f6)
+	let h7: v128 = i64x2.extmul_low_i32x4_s(f_2, f7)
+	let h8: v128 = i64x2.extmul_low_i32x4_s(f_2, f8)
+	let h9: v128 = i64x2.extmul_low_i32x4_s(f_2, f9)
+
+	// f[1]
+	f_2 = i32x4.shl(f1, 1)
+	let f_4: v128 = i32x4.shl(f1, 2)
+
+	let t0: v128 = i64x2.extmul_low_i32x4_s(f_2, f1)
+	let t1: v128 = i64x2.extmul_low_i32x4_s(f_2, f2)
+	let t2: v128 = i64x2.extmul_low_i32x4_s(f_4, f3)
+	let t3: v128 = i64x2.extmul_low_i32x4_s(f_2, f4)
+	let t4: v128 = i64x2.extmul_low_i32x4_s(f_4, f5)
+	let t5: v128 = i64x2.extmul_low_i32x4_s(f_2, f6)
+	let t6: v128 = i64x2.extmul_low_i32x4_s(f_4, f7)
+	let t7: v128 = i64x2.extmul_low_i32x4_s(f_2, f8)
+	let t8: v128 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+	h2 = i64x2.add(h2, t0)
+	h3 = i64x2.add(h3, t1)
+	h4 = i64x2.add(h4, t2)
+	h5 = i64x2.add(h5, t3)
+	h6 = i64x2.add(h6, t4)
+	h7 = i64x2.add(h7, t5)
+	h8 = i64x2.add(h8, t6)
+	h9 = i64x2.add(h9, t7)
+	h0 = i64x2.add(h0, t8)
+
+	// f[2]
+	f_2 = i32x4.shl(f2, 1)
+
+	t0 = i64x2.extmul_low_i32x4_s(f2, f2)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f3)
+	t2 = i64x2.extmul_low_i32x4_s(f_2, f4)
+	t3 = i64x2.extmul_low_i32x4_s(f_2, f5)
+	t4 = i64x2.extmul_low_i32x4_s(f_2, f6)
+	t5 = i64x2.extmul_low_i32x4_s(f_2, f7)
+	t6 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t7 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+	h4 = i64x2.add(h4, t0)
+	h5 = i64x2.add(h5, t1)
+	h6 = i64x2.add(h6, t2)
+	h7 = i64x2.add(h7, t3)
+	h8 = i64x2.add(h8, t4)
+	h9 = i64x2.add(h9, t5)
+	h0 = i64x2.add(h0, t6)
+	h1 = i64x2.add(h1, t7)
+
+	// f[3]
+	f_2 = i32x4.shl(f3, 1)
+	f_4 = i32x4.shl(f3, 2)
+
+	t0 = i64x2.extmul_low_i32x4_s(f_2, f3)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f4)
+	t2 = i64x2.extmul_low_i32x4_s(f_4, f5)
+	t3 = i64x2.extmul_low_i32x4_s(f_2, f6)
+	t4 = i64x2.extmul_low_i32x4_s(f_4, f7_19) /* 1.959375*2^30 */
+	t5 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t6 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+	h6 = i64x2.add(h6, t0)
+	h7 = i64x2.add(h7, t1)
+	h8 = i64x2.add(h8, t2)
+	h9 = i64x2.add(h9, t3)
+	h0 = i64x2.add(h0, t4) /* 1.959375*2^30 */
+	h1 = i64x2.add(h1, t5)
+	h2 = i64x2.add(h2, t6)
+
+	// f[4]
+	f_2 = i32x4.shl(f4, 1)
+
+	t0 = i64x2.extmul_low_i32x4_s(f4, f4)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f5)
+	t2 = i64x2.extmul_low_i32x4_s(f_2, f6_19)
+	t3 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+	t4 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t5 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+	h8 = i64x2.add(h8, t0)
+	h9 = i64x2.add(h9, t1)
+	h0 = i64x2.add(h0, t2)
+	h1 = i64x2.add(h1, t3)
+	h2 = i64x2.add(h2, t4)
+	h3 = i64x2.add(h3, t5)
+
+	// f[5]
+	f_2 = i32x4.shl(f5, 1)
+	f_4 = i32x4.shl(f5, 2)
+
+	t0 = i64x2.extmul_low_i32x4_s(f_2, f5_19)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f6_19)
+	t2 = i64x2.extmul_low_i32x4_s(f_4, f7_19)
+	t3 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t4 = i64x2.extmul_low_i32x4_s(f_4, f9_19) /* 1.959375*2^30 */
+
+	h0 = i64x2.add(h0, t0)
+	h1 = i64x2.add(h1, t1)
+	h2 = i64x2.add(h2, t2)
+	h3 = i64x2.add(h3, t3)
+	h4 = i64x2.add(h4, t4) /* 1.959375*2^30 */
+
+	// f[6]
+	f_2 = i32x4.shl(f6, 1)
+
+	t0 = i64x2.extmul_low_i32x4_s(f6, f6_19)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+	t2 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t3 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+	h2 = i64x2.add(h2, t0)
+	h3 = i64x2.add(h3, t1)
+	h4 = i64x2.add(h4, t2)
+	h5 = i64x2.add(h5, t3)
+
+	// f[7]
+	f_2 = i32x4.shl(f7, 1)
+	f_4 = i32x4.shl(f7, 2)
+
+	t0 = i64x2.extmul_low_i32x4_s(f_2, f7_19)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f8_19)
+	t2 = i64x2.extmul_low_i32x4_s(f_4, f9_19)
+
+	h4 = i64x2.add(h4, t0)
+	h5 = i64x2.add(h5, t1)
+	h6 = i64x2.add(h6, t2)
+
+	// f[8]
+	f_2 = i32x4.shl(f8, 1)
+
+	t0 = i64x2.extmul_low_i32x4_s(f8, f8_19)
+	t1 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+	h6 = i64x2.add(h6, t0)
+	h7 = i64x2.add(h7, t1)
+
+	// f[9]
+	f_2 = i32x4.shl(f9, 1)
+
+	t0 = i64x2.extmul_low_i32x4_s(f_2, f9_19)
+
+	h8 = i64x2.add(h8, t0)
+
+	const v24: v128 = i64x2.splat(1 << 24)
+	const v25: v128 = i64x2.splat(1 << 25)
+	let carry0: v128
+	let carry1: v128
+	let carry2: v128
+	let carry3: v128
+	let carry4: v128
+	let carry5: v128
+	let carry6: v128
+	let carry7: v128
+	let carry8: v128
+	let carry9: v128
+
+	carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26)
+	h1 = i64x2.add(h1, carry0)
+	h0 = i64x2.sub(h0, i64x2.shl(carry0, 26))
+	carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26)
+	h5 = i64x2.add(h5, carry4)
+	h4 = i64x2.sub(h4, i64x2.shl(carry4, 26))
+
+	carry1 = i64x2.shr_s(((i64x2.add(h1, v24))), 25)
+	h2 = i64x2.add(h2, carry1)
+	h1 = i64x2.sub(h1, i64x2.shl(carry1, 25))
+	carry5 = i64x2.shr_s(((i64x2.add(h5, v24))), 25)
+	h6 = i64x2.add(h6, carry5)
+	h5 = i64x2.sub(h5, i64x2.shl(carry5, 25))
+
+	carry2 = i64x2.shr_s(((i64x2.add(h2, v25))), 26)
+	h3 = i64x2.add(h3, carry2)
+	h2 = i64x2.sub(h2, i64x2.shl(carry2, 26))
+	carry6 = i64x2.shr_s(((i64x2.add(h6, v25))), 26)
+	h7 = i64x2.add(h7, carry6)
+	h6 = i64x2.sub(h6, i64x2.shl(carry6, 26))
+
+	carry3 = i64x2.shr_s(((i64x2.add(h3, v24))), 25)
+	h4 = i64x2.add(h4, carry3)
+	h3 = i64x2.sub(h3, i64x2.shl(carry3, 25))
+	carry7 = i64x2.shr_s(((i64x2.add(h7, v24))), 25)
+	h8 = i64x2.add(h8, carry7)
+	h7 = i64x2.sub(h7, i64x2.shl(carry7, 25))
+
+	carry4 = i64x2.shr_s(((i64x2.add(h4, v25))), 26)
+	h5 = i64x2.add(h5, carry4)
+	h4 = i64x2.sub(h4, i64x2.shl(carry4, 26))
+	carry8 = i64x2.shr_s(((i64x2.add(h8, v25))), 26)
+	h9 = i64x2.add(h9, carry8)
+	h8 = i64x2.sub(h8, i64x2.shl(carry8, 26))
+
+	carry9 = i64x2.shr_s(((i64x2.add(h9, v24))), 25)
+	h0 = i64x2.add(h0, i64x2.mul(carry9, i64x2.splat(19)))
+	h9 = i64x2.sub(h9, i64x2.shl(carry9, 25))
+
+	carry0 = i64x2.shr_s(((i64x2.add(h0, v25))), 26)
+	h1 = i64x2.add(h1, carry0)
+	h0 = i64x2.sub(h0, i64x2.shl(carry0, 26))
+
+	const hx_ptr: usize = changetype<usize>(hx)
+	store<i32>(hx_ptr, i64x2.extract_lane(h0, 0), 0)
+	store<i32>(hx_ptr, i64x2.extract_lane(h1, 0), 4)
+	store<i32>(hx_ptr, i64x2.extract_lane(h2, 0), 8)
+	store<i32>(hx_ptr, i64x2.extract_lane(h3, 0), 12)
+	store<i32>(hx_ptr, i64x2.extract_lane(h4, 0), 16)
+	store<i32>(hx_ptr, i64x2.extract_lane(h5, 0), 20)
+	store<i32>(hx_ptr, i64x2.extract_lane(h6, 0), 24)
+	store<i32>(hx_ptr, i64x2.extract_lane(h7, 0), 28)
+	store<i32>(hx_ptr, i64x2.extract_lane(h8, 0), 32)
+	store<i32>(hx_ptr, i64x2.extract_lane(h9, 0), 36)
+
+	const hy_ptr: usize = changetype<usize>(hy)
+	store<i32>(hy_ptr, i64x2.extract_lane(h0, 1), 0)
+	store<i32>(hy_ptr, i64x2.extract_lane(h1, 1), 4)
+	store<i32>(hy_ptr, i64x2.extract_lane(h2, 1), 8)
+	store<i32>(hy_ptr, i64x2.extract_lane(h3, 1), 12)
+	store<i32>(hy_ptr, i64x2.extract_lane(h4, 1), 16)
+	store<i32>(hy_ptr, i64x2.extract_lane(h5, 1), 20)
+	store<i32>(hy_ptr, i64x2.extract_lane(h6, 1), 24)
+	store<i32>(hy_ptr, i64x2.extract_lane(h7, 1), 28)
+	store<i32>(hy_ptr, i64x2.extract_lane(h8, 1), 32)
+	store<i32>(hy_ptr, i64x2.extract_lane(h9, 1), 36)
 }
 
 /**
  * Square a FieldElement, double it, and store the result.
  *
+ * h = 2 * f * f
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ * |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
+ *
+ * Postconditions:
+ * |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
+ *
  * @param h FieldElement power destination
  * @param f FieldElement base source
  */
-//@ts-expect-error
-@inline
 export function fe_sq2 (h: FieldElement, f: FieldElement): void {
-	fe_mul(h, f, f)
-	fe_add(h, h, h)
+	const f_ptr: usize = changetype<usize>(f)
+	const f0: i64 = i64(load<i32>(f_ptr, 0))
+	const f1: i64 = i64(load<i32>(f_ptr, 4))
+	const f2: i64 = i64(load<i32>(f_ptr, 8))
+	const f3: i64 = i64(load<i32>(f_ptr, 12))
+	const f4: i64 = i64(load<i32>(f_ptr, 16))
+	const f5: i64 = i64(load<i32>(f_ptr, 20))
+	const f6: i64 = i64(load<i32>(f_ptr, 24))
+	const f7: i64 = i64(load<i32>(f_ptr, 28))
+	const f8: i64 = i64(load<i32>(f_ptr, 32))
+	const f9: i64 = i64(load<i32>(f_ptr, 36))
+
+	const f0_2: i64 = f0 * 2
+	const f1_2: i64 = f1 * 2
+	const f2_2: i64 = f2 * 2
+	const f3_2: i64 = f3 * 2
+	const f4_2: i64 = f4 * 2
+	const f5_2: i64 = f5 * 2
+	const f6_2: i64 = f6 * 2
+	const f7_2: i64 = f7 * 2
+
+	const f5_38: i64 = f5 * 38 /* 1.959375*2^30 */
+	const f6_19: i64 = f6 * 19 /* 1.959375*2^30 */
+	const f7_38: i64 = f7 * 38/* 1.959375*2^30 */
+	const f8_19: i64 = f8 * 19/* 1.959375*2^30 */
+	const f9_38: i64 = f9 * 38/* 1.959375*2^30 */
+
+	const f0f0: i64 = f0 * f0
+	const f0f1_2: i64 = f0_2 * f1
+	const f0f2_2: i64 = f0_2 * f2
+	const f0f3_2: i64 = f0_2 * f3
+	const f0f4_2: i64 = f0_2 * f4
+	const f0f5_2: i64 = f0_2 * f5
+	const f0f6_2: i64 = f0_2 * f6
+	const f0f7_2: i64 = f0_2 * f7
+	const f0f8_2: i64 = f0_2 * f8
+	const f0f9_2: i64 = f0_2 * f9
+
+	const f1f1_2: i64 = f1_2 * f1
+	const f1f2_2: i64 = f1_2 * f2
+	const f1f3_4: i64 = f1_2 * f3_2
+	const f1f4_2: i64 = f1_2 * f4
+	const f1f5_4: i64 = f1_2 * f5_2
+	const f1f6_2: i64 = f1_2 * f6
+	const f1f7_4: i64 = f1_2 * f7_2
+	const f1f8_2: i64 = f1_2 * f8
+	const f1f9_76: i64 = f1_2 * f9_38
+
+	const f2f2: i64 = f2 * f2
+	const f2f3_2: i64 = f2_2 * f3
+	const f2f4_2: i64 = f2_2 * f4
+	const f2f5_2: i64 = f2_2 * f5
+	const f2f6_2: i64 = f2_2 * f6
+	const f2f7_2: i64 = f2_2 * f7
+	const f2f8_38: i64 = f2_2 * f8_19
+	const f2f9_38: i64 = f2 * f9_38
+
+	const f3f3_2: i64 = f3_2 * f3
+	const f3f4_2: i64 = f3_2 * f4
+	const f3f5_4: i64 = f3_2 * f5_2
+	const f3f6_2: i64 = f3_2 * f6
+	const f3f7_76: i64 = f3_2 * f7_38
+	const f3f8_38: i64 = f3_2 * f8_19
+	const f3f9_76: i64 = f3_2 * f9_38
+
+	const f4f4: i64 = f4 * f4
+	const f4f5_2: i64 = f4_2 * f5
+	const f4f6_38: i64 = f4_2 * f6_19
+	const f4f7_38: i64 = f4 * f7_38
+	const f4f8_38: i64 = f4_2 * f8_19
+	const f4f9_38: i64 = f4 * f9_38
+
+	const f5f5_38: i64 = f5 * f5_38
+	const f5f6_38: i64 = f5_2 * f6_19
+	const f5f7_76: i64 = f5_2 * f7_38
+	const f5f8_38: i64 = f5_2 * f8_19
+	const f5f9_76: i64 = f5_2 * f9_38
+
+	const f6f6_19: i64 = f6 * f6_19
+	const f6f7_38: i64 = f6 * f7_38
+	const f6f8_38: i64 = f6_2 * f8_19
+	const f6f9_38: i64 = f6 * f9_38
+
+	const f7f7_38: i64 = f7 * f7_38
+	const f7f8_38: i64 = f7_2 * f8_19
+	const f7f9_76: i64 = f7_2 * f9_38
+
+	const f8f8_19: i64 = f8 * f8_19
+	const f8f9_38: i64 = f8 * f9_38
+
+	const f9f9_38: i64 = f9 * f9_38
+
+	let h0: i64 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
+	let h1: i64 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
+	let h2: i64 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
+	let h3: i64 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
+	let h4: i64 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
+	let h5: i64 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
+	let h6: i64 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
+	let h7: i64 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
+	let h8: i64 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
+	let h9: i64 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
+
+	h0 *= 2
+	h1 *= 2
+	h2 *= 2
+	h3 *= 2
+	h4 *= 2
+	h5 *= 2
+	h6 *= 2
+	h7 *= 2
+	h8 *= 2
+	h9 *= 2
+
+	let carry0: i64
+	let carry1: i64
+	let carry2: i64
+	let carry3: i64
+	let carry4: i64
+	let carry5: i64
+	let carry6: i64
+	let carry7: i64
+	let carry8: i64
+	let carry9: i64
+
+	carry0 = (h0 + i64(1 << 25)) >> 26
+	h1 += carry0
+	h0 -= carry0 << 26
+	carry4 = (h4 + i64(1 << 25)) >> 26
+	h5 += carry4
+	h4 -= carry4 << 26
+
+	carry1 = (h1 + i64(1 << 24)) >> 25
+	h2 += carry1
+	h1 -= carry1 << 25
+	carry5 = (h5 + i64(1 << 24)) >> 25
+	h6 += carry5
+	h5 -= carry5 << 25
+
+	carry2 = (h2 + i64(1 << 25)) >> 26
+	h3 += carry2
+	h2 -= carry2 << 26
+	carry6 = (h6 + i64(1 << 25)) >> 26
+	h7 += carry6
+	h6 -= carry6 << 26
+
+	carry3 = (h3 + i64(1 << 24)) >> 25
+	h4 += carry3
+	h3 -= carry3 << 25
+	carry7 = (h7 + i64(1 << 24)) >> 25
+	h8 += carry7
+	h7 -= carry7 << 25
+
+	carry4 = (h4 + i64(1 << 25)) >> 26
+	h5 += carry4
+	h4 -= carry4 << 26
+	carry8 = (h8 + i64(1 << 25)) >> 26
+	h9 += carry8
+	h8 -= carry8 << 26
+
+	carry9 = (h9 + i64(1 << 24)) >> 25
+	h0 += carry9 * 19
+	h9 -= carry9 << 25
+
+	carry0 = (h0 + i64(1 << 25)) >> 26
+	h1 += carry0
+	h0 -= carry0 << 26
+
+	const h_ptr: usize = changetype<usize>(h)
+	store<i32>(h_ptr, h0, 0)
+	store<i32>(h_ptr, h1, 4)
+	store<i32>(h_ptr, h2, 8)
+	store<i32>(h_ptr, h3, 12)
+	store<i32>(h_ptr, h4, 16)
+	store<i32>(h_ptr, h5, 20)
+	store<i32>(h_ptr, h6, 24)
+	store<i32>(h_ptr, h7, 28)
+	store<i32>(h_ptr, h8, 32)
+	store<i32>(h_ptr, h9, 36)
 }
 
 /**
@@ -598,21 +1415,12 @@ export function fe_sq2 (h: FieldElement, f: FieldElement): void {
 //@ts-expect-error
 @inline
 export function fe_sub (h: FieldElement, f: FieldElement, g: FieldElement): void {
-	const o = changetype<usize>(h)
-	const a = changetype<usize>(f)
-	const b = changetype<usize>(g)
-
-	let ai = v128.load(a)
-	let bi = v128.load(b)
-	v128.store(o, v128.sub<i32>(ai, bi))
-
-	ai = v128.load(a, 16)
-	bi = v128.load(b, 16)
-	v128.store(o, v128.sub<i32>(ai, bi), 16)
-
-	ai = i64x2(load<i64>(a + 32), 0)
-	bi = i64x2(load<i64>(b + 32), 0)
-	v128.store_lane<i64>(o, v128.sub<i32>(ai, bi), 0, 32)
+	const f_ptr = changetype<usize>(f)
+	const g_ptr = changetype<usize>(g)
+	const h_ptr = changetype<usize>(h)
+	v128.store(h_ptr, v128.sub<i32>(v128.load(f_ptr, 0), v128.load(g_ptr, 0)), 0)
+	v128.store(h_ptr, v128.sub<i32>(v128.load(f_ptr, 16), v128.load(g_ptr, 16)), 16)
+	v128.store_lane<i64>(h_ptr, v128.sub<i32>(v128.load(f_ptr, 32), v128.load(g_ptr, 32)), 0, 32)
 }
 
 const fe_tobytes_t: FieldElement = fe()
@@ -626,183 +1434,49 @@ const fe_tobytes_t: FieldElement = fe()
 export function fe_tobytes (s: StaticArray<u8>, h: FieldElement): void {
 	const t = fe_tobytes_t
 	fe_reduce(t, h)
-	s[0] = u8(t[0] >> 0)
-	s[1] = u8(t[0] >> 8)
-	s[2] = u8(t[0] >> 16)
-	s[3] = u8((t[0] >> 24) | (t[1] * (u32(1) << 2)))
-	s[4] = u8(t[1] >> 6)
-	s[5] = u8(t[1] >> 14)
-	s[6] = u8((t[1] >> 22) | (t[2] * (u32(1) << 3)))
-	s[7] = u8(t[2] >> 5)
-	s[8] = u8(t[2] >> 13)
-	s[9] = u8((t[2] >> 21) | (t[3] * (u32(1) << 5)))
-	s[10] = u8(t[3] >> 3)
-	s[11] = u8(t[3] >> 11)
-	s[12] = u8((t[3] >> 19) | (t[4] * (u32(1) << 6)))
-	s[13] = u8(t[4] >> 2)
-	s[14] = u8(t[4] >> 10)
-	s[15] = u8(t[4] >> 18)
-	s[16] = u8(t[5] >> 0)
-	s[17] = u8(t[5] >> 8)
-	s[18] = u8(t[5] >> 16)
-	s[19] = u8((t[5] >> 24) | (t[6] * (u32(1) << 1)))
-	s[20] = u8(t[6] >> 7)
-	s[21] = u8(t[6] >> 15)
-	s[22] = u8((t[6] >> 23) | (t[7] * (u32(1) << 3)))
-	s[23] = u8(t[7] >> 5)
-	s[24] = u8(t[7] >> 13)
-	s[25] = u8((t[7] >> 21) | (t[8] * (u32(1) << 4)))
-	s[26] = u8(t[8] >> 4)
-	s[27] = u8(t[8] >> 12)
-	s[28] = u8((t[8] >> 20) | (t[9] * (u32(1) << 6)))
-	s[29] = u8(t[9] >> 2)
-	s[30] = u8(t[9] >> 10)
-	s[31] = u8(t[9] >> 18)
-}
-
-/**
- * Internal function.
- *
- * Normalize results of `fe_multiply` and `fe_square` across limbs. This
- * requires a reduction step and two carry steps.
- *
- * 	// reduce
- * 	for (let i = 0; i < 15; i++) {
- * 		t[i] += 38 * t[i + 16]
- * 	}
- * 	// carry twice
- * 	for (let i = 0; i < 2; i++) {
- * 		for (let i = 0; i < 16; i++) {
- * 			c += t[i]
- * 			t[i] = c & 0xFFFF
- * 			c >>= 16
- * 		}
- * 		t[0] += 38 * c
- * }
-*/
-//@ts-expect-error
-@inline
-function normalize (o: usize, t: usize): void {
-	// reduce from 20 limbs to 10
-	let x = load<i64>(t)
-	let y = load<i64>(t, 80)
-	store<i64>(t, x + (19 * y))
-
-	x = load<i64>(t, 8)
-	y = load<i64>(t, 88)
-	store<i64>(t, x + (19 * y), 8)
-
-	x = load<i64>(t, 16)
-	y = load<i64>(t, 96)
-	store<i64>(t, x + (19 * y), 16)
-
-	x = load<i64>(t, 24)
-	y = load<i64>(t, 104)
-	store<i64>(t, x + (19 * y), 24)
-
-	x = load<i64>(t, 32)
-	y = load<i64>(t, 112)
-	store<i64>(t, x + (19 * y), 32)
-
-	x = load<i64>(t, 40)
-	y = load<i64>(t, 120)
-	store<i64>(t, x + (19 * y), 40)
-
-	x = load<i64>(t, 48)
-	y = load<i64>(t, 128)
-	store<i64>(t, x + (19 * y), 48)
-
-	x = load<i64>(t, 56)
-	y = load<i64>(t, 136)
-	store<i64>(t, x + (19 * y), 56)
-
-	x = load<i64>(t, 64)
-	y = load<i64>(t, 144)
-	store<i64>(t, x + (19 * y), 64)
-
-	x = load<i64>(t, 72)
-	y = load<i64>(t, 152)
-	store<i64>(t, x + (19 * y), 72)
-
-	// first carry
-	let v: i64 = load<i64>(t, 0)
-	let c: i64 = (v + (1 << 25)) >> 26
-	store<i64>(t, v - (c << 26), 0)
-
-	v = load<i64>(t, 8) + c
-	c = (v + (1 << 24)) >> 25
-	store<i64>(t, v - (c << 25), 8)
-
-	v = load<i64>(t, 16) + c
-	c = (v + (1 << 25)) >> 26
-	store<i64>(t, v - (c << 26), 16)
-
-	v = load<i64>(t, 24) + c
-	c = (v + (1 << 24)) >> 25
-	store<i64>(t, v - (c << 25), 24)
-
-	v = load<i64>(t, 32) + c
-	c = (v + (1 << 25)) >> 26
-	store<i64>(t, v - (c << 26), 32)
-
-	v = load<i64>(t, 40) + c
-	c = (v + (1 << 24)) >> 25
-	store<i64>(t, v - (c << 25), 40)
-
-	v = load<i64>(t, 48) + c
-	c = (v + (1 << 25)) >> 26
-	store<i64>(t, v - (c << 26), 48)
-
-	v = load<i64>(t, 56) + c
-	c = (v + (1 << 24)) >> 25
-	store<i64>(t, v - (c << 25), 56)
-
-	v = load<i64>(t, 64) + c
-	c = (v + (1 << 25)) >> 26
-	store<i64>(t, v - (c << 26), 64)
-
-	v = load<i64>(t, 72) + c
-	c = (v + (1 << 24)) >> 25
-	store<i64>(t, v - (c << 25), 72)
-
-	// fold final carry back to start then do second carry and assign to output
-	v = load<i64>(t, 0) + (19 * c)
-	c = (v + (1 << 25)) >> 26
-	store<i32>(o, v - (c << 26), 0)
-
-	v = load<i64>(t, 8) + c
-	c = (v + (1 << 24)) >> 25
-	store<i32>(o, v - (c << 25), 4)
-
-	v = load<i64>(t, 16) + c
-	c = (v + (1 << 25)) >> 26
-	store<i32>(o, v - (c << 26), 8)
-
-	v = load<i64>(t, 24) + c
-	c = (v + (1 << 24)) >> 25
-	store<i32>(o, v - (c << 25), 12)
-
-	v = load<i64>(t, 32) + c
-	c = (v + (1 << 25)) >> 26
-	store<i32>(o, v - (c << 26), 16)
-
-	v = load<i64>(t, 40) + c
-	c = (v + (1 << 24)) >> 25
-	store<i32>(o, v - (c << 25), 20)
-
-	v = load<i64>(t, 48) + c
-	c = (v + (1 << 25)) >> 26
-	store<i32>(o, v - (c << 26), 24)
-
-	v = load<i64>(t, 56) + c
-	c = (v + (1 << 24)) >> 25
-	store<i32>(o, v - (c << 25), 28)
-
-	v = load<i64>(t, 64) + c
-	c = (v + (1 << 25)) >> 26
-	store<i32>(o, v - (c << 26), 32)
-
-	v = load<i64>(t, 72) + c
-	c = (v + (1 << 24)) >> 25
-	store<i32>(o, v - (c << 25), 36)
+	const t_ptr: usize = changetype<usize>(t)
+	const t0 = load<i32>(t_ptr, 0)
+	const t1 = load<i32>(t_ptr, 4)
+	const t2 = load<i32>(t_ptr, 8)
+	const t3 = load<i32>(t_ptr, 12)
+	const t4 = load<i32>(t_ptr, 16)
+	const t5 = load<i32>(t_ptr, 20)
+	const t6 = load<i32>(t_ptr, 24)
+	const t7 = load<i32>(t_ptr, 28)
+	const t8 = load<i32>(t_ptr, 32)
+	const t9 = load<i32>(t_ptr, 36)
+
+	const s_ptr: usize = changetype<usize>(s)
+	store<u8>(s_ptr, t0 >> 0, 0)
+	store<u8>(s_ptr, t0 >> 8, 1)
+	store<u8>(s_ptr, t0 >> 16, 2)
+	store<u8>(s_ptr, (t0 >> 24) | (t1 << 2), 3)
+	store<u8>(s_ptr, t1 >> 6, 4)
+	store<u8>(s_ptr, t1 >> 14, 5)
+	store<u8>(s_ptr, (t1 >> 22) | (t2 << 3), 6)
+	store<u8>(s_ptr, t2 >> 5, 7)
+	store<u8>(s_ptr, t2 >> 13, 8)
+	store<u8>(s_ptr, (t2 >> 21) | (t3 << 5), 9)
+	store<u8>(s_ptr, t3 >> 3, 10)
+	store<u8>(s_ptr, t3 >> 11, 11)
+	store<u8>(s_ptr, (t3 >> 19) | (t4 << 6), 12)
+	store<u8>(s_ptr, t4 >> 2, 13)
+	store<u8>(s_ptr, t4 >> 10, 14)
+	store<u8>(s_ptr, t4 >> 18, 15)
+	store<u8>(s_ptr, t5 >> 0, 16)
+	store<u8>(s_ptr, t5 >> 8, 17)
+	store<u8>(s_ptr, t5 >> 16, 18)
+	store<u8>(s_ptr, (t5 >> 24) | (t6 << 1), 19)
+	store<u8>(s_ptr, t6 >> 7, 20)
+	store<u8>(s_ptr, t6 >> 15, 21)
+	store<u8>(s_ptr, (t6 >> 23) | (t7 << 3), 22)
+	store<u8>(s_ptr, t7 >> 5, 23)
+	store<u8>(s_ptr, t7 >> 13, 24)
+	store<u8>(s_ptr, (t7 >> 21) | (t8 << 4), 25)
+	store<u8>(s_ptr, t8 >> 4, 26)
+	store<u8>(s_ptr, t8 >> 12, 27)
+	store<u8>(s_ptr, (t8 >> 20) | (t9 << 6), 28)
+	store<u8>(s_ptr, t9 >> 2, 29)
+	store<u8>(s_ptr, t9 >> 10, 30)
+	store<u8>(s_ptr, t9 >> 18, 31)
 }
diff --git a/assembly/ge.ts b/assembly/ge.ts
index 386a7c9..6abd642 100644
--- a/assembly/ge.ts
+++ b/assembly/ge.ts
@@ -17,77 +17,13 @@
  * - ge_precomp (Duif): (y+x,y-x,2dxy)
 */
 import { base, base2 } from './base'
-import { ed25519_d, ed25519_d2, fe25519_sqrtm1 } from './constants'
-import { fe, fe_0, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_invert, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sq2, fe_sub, fe_tobytes, FieldElement } from './fe'
+import { ed25519_d, fe25519_sqrtm1 } from './constants'
+import { fe, fe_1, fe_add, fe_cmov, fe_copy, fe_frombytes, fe_isnegative, fe_iszero, fe_mul, fe_neg, fe_pow22523, fe_sq, fe_sub, FieldElement } from './fe'
+import { ge_add_cached, ge_add_precomp, ge_cached, ge_p1p1, ge_p1p1_to_p2, ge_p1p1_to_p3, ge_p2, ge_p2_0, ge_p2_dbl, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_dbl, ge_p3_to_cached, ge_p3_tobytes, ge_precomp, ge_precomp_0, ge_sub_cached, ge_sub_precomp } from './p'
 import { equal, negative, optblocker_u8 } from './utils'
 
-export class ge_p2 {
-	X: FieldElement = fe()
-	Y: FieldElement = fe()
-	Z: FieldElement = fe()
-}
-
-export class ge_p3 extends ge_p2 {
-	T: FieldElement = fe()
-	__brand: 'ge_p3' = 'ge_p3'
-}
-
-export class ge_p1p1 extends ge_p2 {
-	T: FieldElement = fe()
-	__brand: 'ge_p1p1' = 'ge_p1p1'
-}
-
-export class ge_precomp {
-	yplusx: FieldElement = fe()
-	yminusx: FieldElement = fe()
-	xy2d: FieldElement = fe()
-}
-
-export class ge_cached {
-	YplusX: FieldElement = fe()
-	YminusX: FieldElement = fe()
-	Z: FieldElement = fe()
-	T2d: FieldElement = fe()
-}
-
-//@ts-expect-error
-@inline
-function ge_cached_8 (): StaticArray<ge_cached> {
-	return StaticArray.fromArray<ge_cached>([
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached(),
-		new ge_cached()
-	])
-}
-
-//@ts-expect-error
-@inline
-export function ge_p2_0 (h: ge_p2): void {
-	fe_0(h.X)
-	fe_1(h.Y)
-	fe_1(h.Z)
-}
-
-//@ts-expect-error
-@inline
-export function ge_p3_0 (h: ge_p3): void {
-	ge_p2_0(h)
-	fe_0(h.T)
-}
-
 //@ts-expect-error
 @inline
-export function ge_precomp_0 (h: ge_precomp): void {
-	fe_1(h.yplusx)
-	fe_1(h.yminusx)
-	fe_0(h.xy2d)
-}
-
 function ge_cmov (t: ge_precomp, u: ge_precomp, b: u8): void {
 	fe_cmov(t.yplusx, u.yplusx, b)
 	fe_cmov(t.yminusx, u.yminusx, b)
@@ -116,354 +52,12 @@ function ge_cmov8 (t: ge_precomp, precomp: StaticArray<ge_precomp>, b: i8): void
 	ge_cmov(t, minust, bnegative)
 }
 
+//@ts-expect-error
+@inline
 function ge_cmov8_base (t: ge_precomp, pos: i32, b: i8): void {
 	ge_cmov8(t, base[pos], b)
 }
 
-const ge_add_precomp_t0: FieldElement = fe()
-/**
- * r = p + q
- */
-function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
-	const t0 = ge_add_precomp_t0
-	fe_add(r.X, p.Y, p.X)
-	fe_sub(r.Y, p.Y, p.X)
-	fe_mul(r.Z, r.X, q.yplusx)
-	fe_mul(r.Y, r.Y, q.yminusx)
-	fe_mul(r.T, q.xy2d, p.T)
-	fe_add(t0, p.Z, p.Z)
-	fe_sub(r.X, r.Z, r.Y)
-	fe_add(r.Y, r.Z, r.Y)
-	fe_add(r.Z, t0, r.T)
-	fe_sub(r.T, t0, r.T)
-}
-
-function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void {
-	fe_mul(r.X, p.X, p.T)
-	fe_mul(r.Y, p.Y, p.Z)
-	fe_mul(r.Z, p.Z, p.T)
-	fe_mul(r.T, p.X, p.Y)
-}
-
-function ge_p3_to_p2 (r: ge_p2, p: ge_p3): void {
-	fe_copy(r.X, p.X)
-	fe_copy(r.Y, p.Y)
-	fe_copy(r.Z, p.Z)
-}
-
-const ge_p2_dbl_t0: FieldElement = fe()
-function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void {
-	const t0 = ge_p2_dbl_t0
-	fe_sq(r.X, p.X)
-	fe_sq(r.Z, p.Y)
-	fe_sq2(r.T, p.Z)
-	fe_add(r.Y, p.X, p.Y)
-	fe_sq(t0, r.Y)
-	fe_add(r.Y, r.Z, r.X)
-	fe_sub(r.Z, r.Z, r.X)
-	fe_sub(r.X, t0, r.Y)
-	fe_sub(r.T, r.T, r.Z)
-}
-
-const ge_p3_dbl_q: ge_p2 = new ge_p2()
-/**
- * r = 2 * p
- */
-function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void {
-	const q = ge_p3_dbl_q
-	ge_p3_to_p2(q, p)
-	ge_p2_dbl(r, q)
-}
-
-/**
- * r = p
- */
-function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void {
-	fe_mul(r.X, p.X, p.T)
-	fe_mul(r.Y, p.Y, p.Z)
-	fe_mul(r.Z, p.Z, p.T)
-}
-
-/**
- * r = p
- */
-function ge_p3_to_cached (r: ge_cached, p: ge_p3): void {
-	fe_add(r.YplusX, p.Y, p.X)
-	fe_sub(r.YminusX, p.Y, p.X)
-	fe_copy(r.Z, p.Z)
-	fe_mul(r.T2d, p.T, ed25519_d2)
-}
-
-const ge_add_cached_t0: FieldElement = fe()
-/**
- * r = p + q
- */
-function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
-	const t0 = ge_add_cached_t0
-	fe_add(r.X, p.Y, p.X)
-	fe_sub(r.Y, p.Y, p.X)
-	fe_mul(r.Z, r.X, q.YplusX)
-	fe_mul(r.Y, r.Y, q.YminusX)
-	fe_mul(r.T, q.T2d, p.T)
-	fe_mul(r.X, p.Z, q.Z)
-	fe_add(t0, r.X, r.X)
-	fe_sub(r.X, r.Z, r.Y)
-	fe_add(r.Y, r.Z, r.Y)
-	fe_add(r.Z, t0, r.T)
-	fe_sub(r.T, t0, r.T)
-}
-
-function ge_cached_0 (h: ge_cached): void {
-	fe_1(h.YplusX)
-	fe_1(h.YminusX)
-	fe_1(h.Z)
-	fe_0(h.T2d)
-}
-
-function ge_cmov_cached (t: ge_cached, u: ge_cached, b: u8): void {
-	fe_cmov(t.YplusX, u.YplusX, b)
-	fe_cmov(t.YminusX, u.YminusX, b)
-	fe_cmov(t.Z, u.Z, b)
-	fe_cmov(t.T2d, u.T2d, b)
-}
-
-const ge_cmov8_cached_minust: ge_cached = new ge_cached()
-function ge_cmov8_cached (t: ge_cached, cached: StaticArray<ge_cached>, b: i8): void {
-	const minust = ge_cmov8_cached_minust
-	const bnegative: u8 = negative(b)
-	const babs: u8 = b - (((-bnegative) & b) << 1)
-
-	ge_cached_0(t)
-	ge_cmov_cached(t, cached[0], equal(babs, 1))
-	ge_cmov_cached(t, cached[1], equal(babs, 2))
-	ge_cmov_cached(t, cached[2], equal(babs, 3))
-	ge_cmov_cached(t, cached[3], equal(babs, 4))
-	ge_cmov_cached(t, cached[4], equal(babs, 5))
-	ge_cmov_cached(t, cached[5], equal(babs, 6))
-	ge_cmov_cached(t, cached[6], equal(babs, 7))
-	ge_cmov_cached(t, cached[7], equal(babs, 8))
-	fe_copy(minust.YplusX, t.YminusX)
-	fe_copy(minust.YminusX, t.YplusX)
-	fe_copy(minust.Z, t.Z)
-	fe_neg(minust.T2d, t.T2d)
-	ge_cmov_cached(t, minust, bnegative)
-}
-
-const ge_scalarmult_e: StaticArray<i8> = new StaticArray<i8>(64)
-const ge_scalarmult_r: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_s: ge_p2 = new ge_p2()
-const ge_scalarmult_t2: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t3: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t4: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t5: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t6: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t7: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_t8: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_p2: ge_p3 = new ge_p3()
-const ge_scalarmult_p3: ge_p3 = new ge_p3()
-const ge_scalarmult_p4: ge_p3 = new ge_p3()
-const ge_scalarmult_p5: ge_p3 = new ge_p3()
-const ge_scalarmult_p6: ge_p3 = new ge_p3()
-const ge_scalarmult_p7: ge_p3 = new ge_p3()
-const ge_scalarmult_p8: ge_p3 = new ge_p3()
-const ge_scalarmult_pi: StaticArray<ge_cached> = ge_cached_8()
-const ge_scalarmult_t: ge_cached = new ge_cached()
-/**
- * `h = a * p`
- *
- * where
- *
- * `a = 256â°a[0] + 256Â¹a[1] + ... + 256Â³Â¹a[31]`
- *
- * Preconditions:
- * - `a[31] <= 127`
- * - `p` is public
- */
-export function ge_scalarmult (h: ge_p3, a: StaticArray<u8>, p: ge_p3): void {
-	const e = ge_scalarmult_e
-	const r = ge_scalarmult_r
-	const s = ge_scalarmult_s
-	const t2 = ge_scalarmult_t2
-	const t3 = ge_scalarmult_t3
-	const t4 = ge_scalarmult_t4
-	const t5 = ge_scalarmult_t5
-	const t6 = ge_scalarmult_t6
-	const t7 = ge_scalarmult_t7
-	const t8 = ge_scalarmult_t8
-	const p2 = ge_scalarmult_p2
-	const p3 = ge_scalarmult_p3
-	const p4 = ge_scalarmult_p4
-	const p5 = ge_scalarmult_p5
-	const p6 = ge_scalarmult_p6
-	const p7 = ge_scalarmult_p7
-	const p8 = ge_scalarmult_p8
-	const pi = ge_scalarmult_pi
-	const t = ge_scalarmult_t
-	let carry: i8
-	let i: i8
-
-	ge_p3_to_cached(pi[1 - 1], p)   /* p */
-
-	ge_p3_dbl(t2, p)
-	ge_p1p1_to_p3(p2, t2)
-	ge_p3_to_cached(pi[2 - 1], p2) /* 2p = 2*p */
-
-	ge_add_cached(t3, p, pi[2 - 1])
-	ge_p1p1_to_p3(p3, t3)
-	ge_p3_to_cached(pi[3 - 1], p3) /* 3p = 2p+p */
-
-	ge_p3_dbl(t4, p2)
-	ge_p1p1_to_p3(p4, t4)
-	ge_p3_to_cached(pi[4 - 1], p4) /* 4p = 2*2p */
-
-	ge_add_cached(t5, p, pi[4 - 1])
-	ge_p1p1_to_p3(p5, t5)
-	ge_p3_to_cached(pi[5 - 1], p5) /* 5p = 4p+p */
-
-	ge_p3_dbl(t6, p3)
-	ge_p1p1_to_p3(p6, t6)
-	ge_p3_to_cached(pi[6 - 1], p6) /* 6p = 2*3p */
-
-	ge_add_cached(t7, p, pi[6 - 1])
-	ge_p1p1_to_p3(p7, t7)
-	ge_p3_to_cached(pi[7 - 1], p7) /* 7p = 6p+p */
-
-	ge_p3_dbl(t8, p4)
-	ge_p1p1_to_p3(p8, t8)
-	ge_p3_to_cached(pi[8 - 1], p8) /* 8p = 2*4p */
-
-	for (i = 0; i < 32; ++i) {
-		e[2 * i + 0] = (a[i] >> 0) & 15
-		e[2 * i + 1] = (a[i] >> 4) & 15
-	}
-	/* each e[i] is between 0 and 15 */
-	/* e[63] is between 0 and 7 */
-
-	carry = 0
-	for (i = 0; i < 63; ++i) {
-		e[i] += carry
-		carry = e[i] + 8
-		carry >>= 4
-		e[i] -= i8(carry << 4)
-	}
-	e[63] += carry
-	/* each e[i] is between -8 and 8 */
-
-	ge_p3_0(h)
-
-	for (i = 63; i != 0; i--) {
-		ge_cmov8_cached(t, pi, e[i])
-		ge_add_cached(r, h, t)
-
-		ge_p1p1_to_p2(s, r)
-		ge_p2_dbl(r, s)
-		ge_p1p1_to_p2(s, r)
-		ge_p2_dbl(r, s)
-		ge_p1p1_to_p2(s, r)
-		ge_p2_dbl(r, s)
-		ge_p1p1_to_p2(s, r)
-		ge_p2_dbl(r, s)
-
-		ge_p1p1_to_p3(h, r)  /* *16 */
-	}
-	ge_cmov8_cached(t, pi, e[i])
-	ge_add_cached(r, h, t)
-
-	ge_p1p1_to_p3(h, r)
-}
-
-const ge_scalarmult_base_e: StaticArray<i8> = new StaticArray<i8>(64)
-const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1()
-const ge_scalarmult_base_s: ge_p2 = new ge_p2()
-const ge_scalarmult_base_t: ge_precomp = new ge_precomp()
-/**
- * `h = a * B` (with precomputation)
- *
- * where
- *
- * `a = 256â°a[0] + 256Â¹a[1] + ... + 256Â³Â¹a[31]`
- *
- * B is the Ed25519 base point `(x,â)` with `x` positive. As bytes:
- *
- * `0x5866666666666666666666666666666666666666666666666666666666666666`
- *
- * Preconditions:
- * - `a[31] <= 127`
- */
-export function ge_scalarmult_base (h: ge_p3, a: StaticArray<u8>): void {
-	const e = ge_scalarmult_base_e
-	const r = ge_scalarmult_base_r
-	const s = ge_scalarmult_base_s
-	const t = ge_scalarmult_base_t
-
-	for (let i = 0; i < 32; ++i) {
-		e[(i << 1) + 0] = (a[i] >> 0) & 15
-		e[(i << 1) + 1] = (a[i] >> 4) & 15
-	}
-	/* each e[i] is between 0 and 15 */
-	/* e[63] is between 0 and 7 */
-
-	let carry: i8 = 0
-	for (let i = 0; i < 63; ++i) {
-		e[i] += carry
-		carry = e[i] + 8
-		carry >>= 4
-		e[i] -= carry * (i8(1) << 4)
-	}
-	e[63] += carry
-	/* each e[i] is between -8 and 8 */
-
-	ge_p3_0(h)
-
-	for (let i = 1; i < 64; i += 2) {
-		ge_cmov8_base(t, i / 2, e[i])
-		ge_add_precomp(r, h, t)
-		ge_p1p1_to_p3(h, r)
-	}
-
-	ge_p3_dbl(r, h)
-	ge_p1p1_to_p2(s, r)
-	ge_p2_dbl(r, s)
-	ge_p1p1_to_p2(s, r)
-	ge_p2_dbl(r, s)
-	ge_p1p1_to_p2(s, r)
-	ge_p2_dbl(r, s)
-	ge_p1p1_to_p3(h, r)
-
-	for (let i = 0; i < 64; i += 2) {
-		ge_cmov8_base(t, i / 2, e[i])
-		ge_add_precomp(r, h, t)
-		ge_p1p1_to_p3(h, r)
-	}
-}
-
-const ge_p3_tobytes_recip: FieldElement = fe()
-const ge_p3_tobytes_x: FieldElement = fe()
-const ge_p3_tobytes_y: FieldElement = fe()
-export function ge_p3_tobytes (s: StaticArray<u8>, h: ge_p3): void {
-	const recip = ge_p3_tobytes_recip
-	const x = ge_p3_tobytes_x
-	const y = ge_p3_tobytes_y
-	fe_invert(recip, h.Z)
-	fe_mul(x, h.X, recip)
-	fe_mul(y, h.Y, recip)
-	fe_tobytes(s, y)
-	s[31] ^= fe_isnegative(x) << 7
-}
-
-/**
- * true if `s < p`
- */
-export function ge_is_canonical (s: StaticArray<u8>): u8 {
-	let c: u8 = (s[31] & 0x7f) ^ 0x7f
-	for (let i = 30; i > 0; i--) {
-		c |= s[i] ^ 0xff
-	}
-	c = (c - 1) >> 8
-	const d: u8 = (0xec - s[0]) >> 8
-	return 1 - (c & d & 1)
-}
-
 const u: FieldElement = fe()
 const v: FieldElement = fe()
 const v3: FieldElement = fe()
@@ -499,7 +93,7 @@ export function ge_frombytes_negate_vartime (h: ge_p3, s: StaticArray<u8>): i32
 		fe_mul(h.X, h.X, fe25519_sqrtm1)
 	}
 
-	if (fe_isnegative(h.X) == (s[31] >> 7)) { /* vartime function - compiler optimization is fine */
+	if (fe_isnegative(h.X) == (load<u8>(changetype<usize>(s), 31) >> 7)) { /* vartime function - compiler optimization is fine */
 		fe_neg(h.X, h.X)
 	}
 	fe_mul(h.T, h.X, h.Y)
@@ -572,45 +166,102 @@ export function ge_has_small_order (p: ge_p3): i32 {
 }
 
 /**
- * r = p
+ * true if `s < p`
  */
-export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void {
-	fe_copy(r.X, p.X)
-	fe_copy(r.Y, p.Y)
-	fe_copy(r.Z, p.Z)
-	fe_mul(r.T, p.X, p.Y)
+export function ge_is_canonical (s: StaticArray<u8>): u8 {
+	let c: u8 = (s[31] & 0x7f) ^ 0x7f
+	for (let i = 30; i > 0; i--) {
+		c |= s[i] ^ 0xff
+	}
+	c = (c - 1) >> 8
+	const d: u8 = (0xec - s[0]) >> 8
+	return 1 - (c & d & 1)
 }
 
-const ge_p3_sub_q_neg: ge_p3 = new ge_p3()
-/* r = p-q */
-export function ge_p3_sub (r: ge_p3, p: ge_p3, q: ge_p3): void {
-	const q_neg = ge_p3_sub_q_neg
-	ge_p3_neg(q_neg, q)
-	ge_p3_add(r, p, q_neg)
-}
+const ge_scalarmult_base_e: StaticArray<i8> = new StaticArray<i8>(64)
+const ge_scalarmult_base_r: ge_p1p1 = new ge_p1p1()
+const ge_scalarmult_base_s: ge_p2 = new ge_p2()
+const ge_scalarmult_base_t: ge_precomp = new ge_precomp()
+/**
+ * `h = a * B` (with precomputation)
+ *
+ * where
+ *
+ * `a = 256â°a[0] + 256Â¹a[1] + ... + 256Â³Â¹a[31]`
+ *
+ * B is the Ed25519 base point `(x,â)` with `x` positive. As bytes:
+ *
+ * `0x5866666666666666666666666666666666666666666666666666666666666666`
+ *
+ * Preconditions:
+ * - `a[31] <= 127`
+ */
+function ge_scalarmult_base (h: ge_p3, a: StaticArray<u8>): void {
+	const e = ge_scalarmult_base_e
+	const r = ge_scalarmult_base_r
+	const s = ge_scalarmult_base_s
+	const t = ge_scalarmult_base_t
+
+	for (let i = 0; i < 32; ++i) {
+		e[(i << 1) + 0] = (a[i] >> 0) & 15
+		e[(i << 1) + 1] = (a[i] >> 4) & 15
+	}
+	/* each e[i] is between 0 and 15 */
+	/* e[63] is between 0 and 7 */
+
+	let carry: i8 = 0
+	for (let i = 0; i < 63; ++i) {
+		e[i] += carry
+		carry = e[i] + 8
+		carry >>= 4
+		e[i] -= carry * (i8(1) << 4)
+	}
+	e[63] += carry
+	/* each e[i] is between -8 and 8 */
+
+	ge_p3_0(h)
+
+	for (let i = 1; i < 64; i += 2) {
+		ge_cmov8_base(t, i / 2, e[i])
+		ge_add_precomp(r, h, t)
+		ge_p1p1_to_p3(h, r)
+	}
+
+	ge_p3_dbl(r, h)
+	ge_p1p1_to_p2(s, r)
+	ge_p2_dbl(r, s)
+	ge_p1p1_to_p2(s, r)
+	ge_p2_dbl(r, s)
+	ge_p1p1_to_p2(s, r)
+	ge_p2_dbl(r, s)
+	ge_p1p1_to_p3(h, r)
 
-/* r = -p */
-function ge_p3_neg (r: ge_p3, p: ge_p3): void {
-	fe_neg(r.X, p.X)
-	fe_copy(r.Y, p.Y)
-	fe_copy(r.Z, p.Z)
-	fe_neg(r.T, p.T)
+	for (let i = 0; i < 64; i += 2) {
+		ge_cmov8_base(t, i / 2, e[i])
+		ge_add_precomp(r, h, t)
+		ge_p1p1_to_p3(h, r)
+	}
 }
 
-const ge_p3_add_q_cached = new ge_cached()
-const ge_p3_add_q_p1p1 = new ge_p1p1()
-/* r = p+q */
-function ge_p3_add (r: ge_p3, p: ge_p3, q: ge_p3): void {
-	const q_cached = ge_p3_add_q_cached
-	const p1p1 = ge_p3_add_q_p1p1
-	ge_p3_to_cached(q_cached, q)
-	ge_add_cached(p1p1, p, q_cached)
-	ge_p1p1_to_p3(r, p1p1)
+const ge_scalarmult_base_tobytes_h: ge_p3 = new ge_p3()
+export function ge_scalarmult_base_tobytes (s: StaticArray<u8>, a: StaticArray<u8>): void {
+	const h = ge_scalarmult_base_tobytes_h
+	ge_scalarmult_base(h, a)
+	ge_p3_tobytes(s, h)
 }
 
 const ge_double_scalarmult_vartime_aslide: StaticArray<i8> = new StaticArray<i8>(256)
 const ge_double_scalarmult_vartime_bslide: StaticArray<i8> = new StaticArray<i8>(256)
-const ge_double_scalarmult_vartime_Ai: StaticArray<ge_cached> = ge_cached_8() /* A,3A,5A,7A,9A,11A,13A,15A */
+const ge_double_scalarmult_vartime_Ai: StaticArray<ge_cached> = StaticArray.fromArray<ge_cached>([
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached(),
+	new ge_cached()
+]) /* A,3A,5A,7A,9A,11A,13A,15A */
 const ge_double_scalarmult_vartime_t: ge_p1p1 = new ge_p1p1()
 const ge_double_scalarmult_vartime_u: ge_p3 = new ge_p3()
 const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3()
@@ -622,7 +273,7 @@ const ge_double_scalarmult_vartime_A2: ge_p3 = new ge_p3()
  *
  * Only used for signatures verification.
  */
-export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
+function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
 	const Bi = base2
 	const aslide = ge_double_scalarmult_vartime_aslide
 	const bslide = ge_double_scalarmult_vartime_bslide
@@ -699,6 +350,13 @@ export function ge_double_scalarmult_vartime (r: ge_p2, a: StaticArray<u8>, A: g
 	}
 }
 
+const ge_double_scalarmult_vartime_p: ge_p2 = new ge_p2()
+export function ge_double_scalarmult_vartime_to_p3 (r: ge_p3, a: StaticArray<u8>, A: ge_p3, b: StaticArray<u8>): void {
+	const p = ge_double_scalarmult_vartime_p
+	ge_double_scalarmult_vartime(p, a, A, b)
+	ge_p2_to_p3(r, p)
+}
+
 function slide_vartime (r: StaticArray<i8>, a: StaticArray<u8>): void {
 	let i: i32
 	let b: i32
@@ -739,40 +397,3 @@ function slide_vartime (r: StaticArray<i8>, a: StaticArray<u8>): void {
 		}
 	}
 }
-
-const ge_sub_cached_t0: FieldElement = fe()
-/**
- * r = p - q
- */
-function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
-	const t0 = ge_sub_cached_t0
-	fe_add(r.X, p.Y, p.X)
-	fe_sub(r.Y, p.Y, p.X)
-	fe_mul(r.Z, r.X, q.YminusX)
-	fe_mul(r.Y, r.Y, q.YplusX)
-	fe_mul(r.T, q.T2d, p.T)
-	fe_mul(r.X, p.Z, q.Z)
-	fe_add(t0, r.X, r.X)
-	fe_sub(r.X, r.Z, r.Y)
-	fe_add(r.Y, r.Z, r.Y)
-	fe_sub(r.Z, t0, r.T)
-	fe_add(r.T, t0, r.T)
-}
-
-const ge_sub_precomp_t0: FieldElement = fe()
-/**
- * r = p - q
- */
-function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
-	const t0 = ge_sub_precomp_t0
-	fe_add(r.X, p.Y, p.X)
-	fe_sub(r.Y, p.Y, p.X)
-	fe_mul(r.Z, r.X, q.yminusx)
-	fe_mul(r.Y, r.Y, q.yplusx)
-	fe_mul(r.T, q.xy2d, p.T)
-	fe_add(t0, p.Z, p.Z)
-	fe_sub(r.X, r.Z, r.Y)
-	fe_add(r.Y, r.Z, r.Y)
-	fe_sub(r.Z, t0, r.T)
-	fe_add(r.T, t0, r.T)
-}
diff --git a/assembly/nano-nacl.ts b/assembly/nano-nacl.ts
index 6e2f75b..36fadc7 100644
--- a/assembly/nano-nacl.ts
+++ b/assembly/nano-nacl.ts
@@ -2,7 +2,8 @@
 //! SPDX-License-Identifier: GPL-3.0-or-later
 
 import { Blake2b } from './blake2b'
-import { ge_double_scalarmult_vartime, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_p2, ge_p2_to_p3, ge_p3, ge_p3_0, ge_p3_sub, ge_p3_tobytes, ge_scalarmult_base } from './ge'
+import { ge_double_scalarmult_vartime_to_p3, ge_frombytes, ge_frombytes_negate_vartime, ge_has_small_order, ge_is_canonical, ge_scalarmult_base_tobytes } from './ge'
+import { ge_p3, ge_sub_p3 } from './p'
 import { sc_is_canonical, sc_muladd, sc_reduce } from './sc'
 
 const BLOCKHASH_BYTES: i32 = 32
@@ -29,18 +30,11 @@ function crypto_hash (o: StaticArray<u8>, i: StaticArray<u8>): void {
 	blake2b.init().update(i).digest(o)
 }
 
-const crypto_derive_A = new ge_p3()
 function crypto_derive (pk: StaticArray<u8>, sk: StaticArray<u8>, seed: StaticArray<u8>): void {
-	const A = crypto_derive_A
-	ge_p3_0(A)
-
 	crypto_hash(sk, seed)
 	clamp(sk)
 
-	trace(`seed hashed to sk: ${sk.map((b: u8) => b.toString(16).padStart(2, '0')).join('')}`)
-
-	ge_scalarmult_base(A, sk)
-	ge_p3_tobytes(pk, A)
+	ge_scalarmult_base_tobytes(pk, sk)
 	memory.copy(changetype<usize>(sk), changetype<usize>(seed), PRIVATEKEY_BYTES)
 	memory.copy(changetype<usize>(sk) + 32, changetype<usize>(pk), PUBLICKEY_BYTES)
 }
@@ -48,17 +42,14 @@ function crypto_derive (pk: StaticArray<u8>, sk: StaticArray<u8>, seed: StaticAr
 const crypto_sign_az = new StaticArray<u8>(64)
 const crypto_sign_nonce = new StaticArray<u8>(64)
 const crypto_sign_hram = new StaticArray<u8>(64)
-const crypto_sign_R = new ge_p3()
 const crypto_sign_zm = new StaticArray<u8>(SIGNATURE_BYTES)
 const crypto_sign_t = new StaticArray<u8>(PRIVATEKEY_BYTES)
 function crypto_sign (s: StaticArray<u8>, m: StaticArray<u8>, sk: StaticArray<u8>): void {
 	const az = crypto_sign_az
 	const nonce = crypto_sign_nonce
 	const hram = crypto_sign_hram
-	const R = crypto_sign_R
 	const zm = crypto_sign_zm
 	const t = crypto_sign_t
-	ge_p3_0(R)
 
 	// Hash secret key to private scalar `a` and prefix for nonce derivation `z`
 	memory.copy(changetype<usize>(t), changetype<usize>(sk), PRIVATEKEY_BYTES)
@@ -71,9 +62,8 @@ function crypto_sign (s: StaticArray<u8>, m: StaticArray<u8>, sk: StaticArray<u8
 	crypto_hash(nonce, zm)
 	sc_reduce(nonce)
 
-	// Compute R = rB
-	ge_scalarmult_base(R, nonce)
-	ge_p3_tobytes(s, R)
+	// Compute R = rB, output to bytes s
+	ge_scalarmult_base_tobytes(s, nonce)
 
 	// Concatenate public key `A` and message `M`
 	// from parameter arguments: A = sk[0,32], M = m
@@ -97,7 +87,6 @@ const crypto_verify_check = new ge_p3()
 const crypto_verify_expected_r = new ge_p3()
 const crypto_verify_A = new ge_p3()
 const crypto_verify_sb_ah = new ge_p3()
-const crypto_verify_sb_ah_p2 = new ge_p2()
 const crypto_verify_ram = new StaticArray<u8>(SIGNEDBLOCKHASH_BYTES)
 const crypto_verify_S = new StaticArray<u8>(32)
 /**
@@ -109,7 +98,6 @@ function crypto_verify (s: StaticArray<u8>, m: StaticArray<u8>, pk: StaticArray<
 	const expected_r = crypto_verify_expected_r
 	const A = crypto_verify_A
 	const sb_ah = crypto_verify_sb_ah
-	const sb_ah_p2 = crypto_verify_sb_ah_p2
 	const ram = crypto_verify_ram
 	const S = crypto_verify_S
 
@@ -136,9 +124,8 @@ function crypto_verify (s: StaticArray<u8>, m: StaticArray<u8>, pk: StaticArray<
 	crypto_hash(h, ram)
 	sc_reduce(h)
 
-	ge_double_scalarmult_vartime(sb_ah_p2, h, A, S)
-	ge_p2_to_p3(sb_ah, sb_ah_p2)
-	ge_p3_sub(check, expected_r, sb_ah)
+	ge_double_scalarmult_vartime_to_p3(sb_ah, h, A, S)
+	ge_sub_p3(check, expected_r, sb_ah)
 
 	return ge_has_small_order(check) - 1
 }
@@ -169,11 +156,7 @@ export function derive (): void {
 		seed[i] = load<u8>(INPUT_BUFFER + i)
 	}
 
-	const start = performance.now()
 	crypto_derive(pk, sk, seed)
-	const end = performance.now()
-	trace('derive time', 1, end - start)
-
 	for (let i = 0; i < PUBLICKEY_BYTES; i++) {
 		store<u8>(OUTPUT_BUFFER + i, pk[i])
 	}
@@ -202,11 +185,7 @@ export function sign (): void {
 		sk[i] = load<u8>(usize(i) + ptr)
 	}
 
-	const start = performance.now()
 	crypto_sign(s, h, sk)
-	const end = performance.now()
-	trace('sign time', 1, end - start)
-
 	for (let i = 0; i < SIGNATURE_BYTES; i++) {
 		store<u8>(OUTPUT_BUFFER + i, s[i])
 	}
@@ -241,10 +220,6 @@ export function verify (): void {
 		k[i] = load<u8>(usize(i) + ptr)
 	}
 
-	const start = performance.now()
 	const v = crypto_verify(s, h, k)
-	const end = performance.now()
-	trace('verify time', 1, end - start)
-
 	store<u8>(OUTPUT_BUFFER, v)
 }
diff --git a/assembly/p.ts b/assembly/p.ts
new file mode 100644
index 0000000..537eeec
--- /dev/null
+++ b/assembly/p.ts
@@ -0,0 +1,236 @@
+//! SPDX-FileCopyrightText: 2026 Chris Duncan <chris@codecow.com>
+//! SPDX-License-Identifier: GPL-3.0-or-later
+
+import { ed25519_d2 } from './constants'
+import { FieldElement, fe, fe_0, fe_1, fe_add, fe_copy, fe_dbl, fe_invert, fe_isnegative, fe_mul, fe_neg, fe_sq, fe_sq_vec, fe_sq2, fe_sub, fe_tobytes } from './fe'
+
+export class ge_p2 {
+	X: FieldElement = fe()
+	Y: FieldElement = fe()
+	Z: FieldElement = fe()
+}
+
+export class ge_p1p1 extends ge_p2 {
+	T: FieldElement = fe()
+	__brand: 'ge_p1p1' = 'ge_p1p1'
+}
+
+export class ge_p3 extends ge_p2 {
+	T: FieldElement = fe()
+	__brand: 'ge_p3' = 'ge_p3'
+}
+export class ge_cached {
+	YplusX: FieldElement = fe()
+	YminusX: FieldElement = fe()
+	Z: FieldElement = fe()
+	T2d: FieldElement = fe()
+}
+
+export class ge_precomp {
+	yplusx: FieldElement = fe()
+	yminusx: FieldElement = fe()
+	xy2d: FieldElement = fe()
+}
+
+//@ts-expect-error
+@inline
+export function ge_p2_0 (h: ge_p2): void {
+	fe_0(h.X)
+	fe_1(h.Y)
+	fe_1(h.Z)
+}
+
+const ge_p2_dbl_t0: FieldElement = fe()
+//@ts-expect-error
+@inline
+export function ge_p2_dbl (r: ge_p1p1, p: ge_p2): void {
+	const t0 = ge_p2_dbl_t0
+	fe_sq_vec(r.X, p.X, r.Z, p.Y)
+	fe_sq2(r.T, p.Z)
+	fe_add(r.Y, p.X, p.Y)
+	fe_sq(t0, r.Y)
+	fe_add(r.Y, r.Z, r.X)
+	fe_sub(r.Z, r.Z, r.X)
+	fe_sub(r.X, t0, r.Y)
+	fe_sub(r.T, r.T, r.Z)
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p2_to_p3 (r: ge_p3, p: ge_p2): void {
+	fe_copy(r.X, p.X)
+	fe_copy(r.Y, p.Y)
+	fe_copy(r.Z, p.Z)
+	fe_mul(r.T, p.X, p.Y)
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p1p1_to_p2 (r: ge_p2, p: ge_p1p1): void {
+	fe_mul(r.X, p.X, p.T)
+	fe_mul(r.Y, p.Y, p.Z)
+	fe_mul(r.Z, p.Z, p.T)
+}
+
+//@ts-expect-error
+@inline
+export function ge_p1p1_to_p3 (r: ge_p3, p: ge_p1p1): void {
+	fe_mul(r.X, p.X, p.T)
+	fe_mul(r.Y, p.Y, p.Z)
+	fe_mul(r.Z, p.Z, p.T)
+	fe_mul(r.T, p.X, p.Y)
+}
+
+//@ts-expect-error
+@inline
+export function ge_p3_0 (h: ge_p3): void {
+	ge_p2_0(h)
+	fe_0(h.T)
+}
+
+const ge_p3_dbl_q: ge_p2 = new ge_p2()
+/**
+ * r = 2 * p
+ */
+export function ge_p3_dbl (r: ge_p1p1, p: ge_p3): void {
+	const q = ge_p3_dbl_q
+	fe_copy(q.X, p.X)
+	fe_copy(q.Y, p.Y)
+	fe_copy(q.Z, p.Z)
+	ge_p2_dbl(r, q)
+}
+
+const ge_p3_tobytes_recip: FieldElement = fe()
+const ge_p3_tobytes_x: FieldElement = fe()
+const ge_p3_tobytes_y: FieldElement = fe()
+export function ge_p3_tobytes (s: StaticArray<u8>, h: ge_p3): void {
+	const recip = ge_p3_tobytes_recip
+	const x = ge_p3_tobytes_x
+	const y = ge_p3_tobytes_y
+	fe_invert(recip, h.Z)
+	fe_mul(x, h.X, recip)
+	fe_mul(y, h.Y, recip)
+	fe_tobytes(s, y)
+	s[31] ^= fe_isnegative(x) << 7
+}
+
+/**
+ * r = p
+ */
+//@ts-expect-error
+@inline
+export function ge_p3_to_cached (r: ge_cached, p: ge_p3): void {
+	fe_add(r.YplusX, p.Y, p.X)
+	fe_sub(r.YminusX, p.Y, p.X)
+	fe_copy(r.Z, p.Z)
+	fe_mul(r.T2d, p.T, ed25519_d2)
+}
+
+//@ts-expect-error
+@inline
+export function ge_precomp_0 (h: ge_precomp): void {
+	fe_1(h.yplusx)
+	fe_1(h.yminusx)
+	fe_0(h.xy2d)
+}
+
+const ge_add_cached_t0: FieldElement = fe()
+/**
+ * r = p + q
+ */
+export function ge_add_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
+	const t0 = ge_add_cached_t0
+	fe_add(r.X, p.Y, p.X)
+	fe_sub(r.Y, p.Y, p.X)
+	fe_mul(r.Z, r.X, q.YplusX)
+	fe_mul(r.Y, r.Y, q.YminusX)
+	fe_mul(r.T, q.T2d, p.T)
+	fe_mul(r.X, p.Z, q.Z)
+	fe_dbl(t0, r.X)
+	fe_sub(r.X, r.Z, r.Y)
+	fe_add(r.Y, r.Z, r.Y)
+	fe_add(r.Z, t0, r.T)
+	fe_sub(r.T, t0, r.T)
+}
+
+const ge_add_precomp_t0: FieldElement = fe()
+/**
+ * r = p + q
+ */
+//@ts-expect-error
+@inline
+export function ge_add_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
+	const t0 = ge_add_precomp_t0
+	fe_add(r.X, p.Y, p.X)
+	fe_sub(r.Y, p.Y, p.X)
+	fe_mul(r.Z, r.X, q.yplusx)
+	fe_mul(r.Y, r.Y, q.yminusx)
+	fe_mul(r.T, q.xy2d, p.T)
+	fe_dbl(t0, p.Z)
+	fe_sub(r.X, r.Z, r.Y)
+	fe_add(r.Y, r.Z, r.Y)
+	fe_add(r.Z, t0, r.T)
+	fe_sub(r.T, t0, r.T)
+}
+
+const ge_sub_p3_q_cached = new ge_cached()
+const ge_sub_p3_p1p1 = new ge_p1p1()
+/* r = p-q */
+//@ts-expect-error
+@inline
+export function ge_sub_p3 (r: ge_p3, p: ge_p3, q: ge_p3): void {
+	const q_cached = ge_sub_p3_q_cached
+	const p1p1 = ge_sub_p3_p1p1
+
+	fe_neg(r.X, q.X)
+	fe_copy(r.Y, q.Y)
+	fe_copy(r.Z, q.Z)
+	fe_neg(r.T, q.T)
+
+	ge_p3_to_cached(q_cached, r)
+	ge_add_cached(p1p1, p, q_cached)
+	ge_p1p1_to_p3(r, p1p1)
+}
+
+const ge_sub_cached_t0: FieldElement = fe()
+/**
+ * r = p - q
+ */
+export function ge_sub_cached (r: ge_p1p1, p: ge_p3, q: ge_cached): void {
+	const t0 = ge_sub_cached_t0
+	fe_add(r.X, p.Y, p.X)
+	fe_sub(r.Y, p.Y, p.X)
+	fe_mul(r.Z, r.X, q.YminusX)
+	fe_mul(r.Y, r.Y, q.YplusX)
+	fe_mul(r.T, q.T2d, p.T)
+	fe_mul(r.X, p.Z, q.Z)
+	fe_dbl(t0, r.X)
+	fe_sub(r.X, r.Z, r.Y)
+	fe_add(r.Y, r.Z, r.Y)
+	fe_sub(r.Z, t0, r.T)
+	fe_add(r.T, t0, r.T)
+}
+
+const ge_sub_precomp_t0: FieldElement = fe()
+/**
+ * r = p - q
+ */
+export function ge_sub_precomp (r: ge_p1p1, p: ge_p3, q: ge_precomp): void {
+	const t0 = ge_sub_precomp_t0
+	fe_add(r.X, p.Y, p.X)
+	fe_sub(r.Y, p.Y, p.X)
+	fe_mul(r.Z, r.X, q.yminusx)
+	fe_mul(r.Y, r.Y, q.yplusx)
+	fe_mul(r.T, q.xy2d, p.T)
+	fe_dbl(t0, p.Z)
+	fe_sub(r.X, r.Z, r.Y)
+	fe_add(r.Y, r.Z, r.Y)
+	fe_sub(r.Z, t0, r.T)
+	fe_add(r.T, t0, r.T)
+}
diff --git a/index.html b/index.html
index fcfcd4c..656709b 100644
--- a/index.html
+++ b/index.html
@@ -112,26 +112,33 @@ SPDX-License-Identifier: GPL-3.0-or-later
 		}
 
 		export async function test (api, size, runs, isDebug, isSelfCheck) {
+			if (typeof size !== 'number' || size < 1) {
+				size = 1
+			}
+			if (typeof runs !== 'number' || runs < 1) {
+				runs = 1
+			}
 			const selectedApi = api
 			// Execute once on load to initialize worker and WASM
 			const h = random(), k = random(64)
 			await NanoNaCl.sign(h, k)
 
 			const derive = sk => {
+				const sk8 = new Uint8Array(sk.match(/.{2}/g).map(b => parseInt(b, 16)))
 				switch (api) {
 					case 'NanoNaCl': {
-						return NanoNaCl.derive(sk)
+						const pk8 = NanoNaCl.deriveSync(sk8)
+						return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('')
 					}
 					case 'NanocurrencyWeb': {
 						return NanocurrencyWeb.wallet.legacyAccounts(sk)[0].publicKey
 					}
 					case 'Sodium': {
-						const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
-						return sodium.crypto_sign_seed_keypair(new Uint8Array(sk8), 'hex').publicKey
+						return sodium.crypto_sign_seed_keypair(sk8, 'hex').publicKey
 					}
 					case 'TweetNaCl.js': {
 						const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
-						const pk8 = nacl.sign.keyPair.fromSeed(new Uint8Array(sk8)).publicKey
+						const pk8 = nacl.sign.keyPair.fromSeed(sk8).publicKey
 						return [...pk8].map(b => b.toString(16).padStart(2, '0')).join('')
 					}
 					default: {
@@ -140,24 +147,21 @@ SPDX-License-Identifier: GPL-3.0-or-later
 				}
 			}
 			const sign = (h, sk, pk) => {
+				const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16)))
+				const sk8 = new Uint8Array((sk + pk).match(/.{2}/g).map(b => parseInt(b, 16)))
 				switch (api) {
 					case 'NanoNaCl': {
-						return NanoNaCl.sign(h, sk + pk)
+						const s8 = NanoNaCl.signSync(h8, sk8)
+						return [...s8].map(b => b.toString(16).padStart(2, '0')).join('')
 					}
 					case 'NanocurrencyWeb': {
 						const { privateKey } = NanocurrencyWeb.wallet.legacyAccounts(sk)[0]
 						return NanocurrencyWeb.tools.sign(privateKey, h)
 					}
 					case 'Sodium': {
-						sk = sk + pk
-						const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-						const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
 						return sodium.crypto_sign_detached(new Uint8Array(h8), new Uint8Array(sk8), 'hex')
 					}
 					case 'TweetNaCl.js': {
-						sk = sk + pk
-						const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-						const sk8 = sk.match(/.{2}/g).map(b => parseInt(b, 16))
 						const s8 = nacl.sign.detached(new Uint8Array(h8), new Uint8Array(sk8))
 						return [...s8].map(b => b.toString(16).padStart(2, '0')).join('')
 					}
@@ -167,24 +171,22 @@ SPDX-License-Identifier: GPL-3.0-or-later
 				}
 			}
 			const verify = (s, h, pk) => {
+				const h8 = new Uint8Array(h.match(/.{2}/g).map(b => parseInt(b, 16)))
+				const s8 = new Uint8Array(s.match(/.{2}/g).map(b => parseInt(b, 16)))
+				const pk8 = new Uint8Array(pk.match(/.{2}/g).map(b => parseInt(b, 16)))
 				switch (api) {
 					case 'NanoNaCl': {
-						return NanoNaCl.verify(s, h, pk)
+						const v = NanoNaCl.verifySync(s8, h8, pk8)
+						return v[0] === 0
 					}
 					case 'NanocurrencyWeb': {
 						return NanocurrencyWeb.tools.verify(pk, s, h)
 					}
 					case 'Sodium': {
-						const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-						const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16))
-						const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16))
-						return sodium.crypto_sign_verify_detached(new Uint8Array(s8), new Uint8Array(h8), new Uint8Array(pk8))
+						return sodium.crypto_sign_verify_detached(s8, h8, pk8)
 					}
 					case 'TweetNaCl.js': {
-						const h8 = h.match(/.{2}/g).map(b => parseInt(b, 16))
-						const s8 = s.match(/.{2}/g).map(b => parseInt(b, 16))
-						const pk8 = pk.match(/.{2}/g).map(b => parseInt(b, 16))
-						return nacl.sign.detached.verify(new Uint8Array(h8), new Uint8Array(s8), new Uint8Array(pk8))
+						return nacl.sign.detached.verify(h8, s8, pk8)
 					}
 					default: {
 						return null
@@ -196,6 +198,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
 				api = 'NanoNaCl'
 				document.getElementById('status').innerHTML = `RUNNING SELF-CHECK`
 				console.log(`%cNanoNaCl`, 'color:green', 'Checking validation against known values')
+				// https://docs.nano.org/integration-guides/key-management/#success-response_2
 				TEST_VECTORS ??= {
 					blockHash: 'BB569136FA05F8CBF65CEF2EDE368475B289C4477342976556BA4C0DDF216E45',
 					privateKey: '781186FB9EF17DB6E3D1056550D9FAE5D5BBADA6A6BC370E4CBB938B1DC71DA3',
@@ -390,9 +393,9 @@ SPDX-License-Identifier: GPL-3.0-or-later
 		</select>
 	</span>
 	<label for="size">Test Size</label>
-	<input id="size" type="number" value="100" autofocus />
+	<input id="size" type="number" value="100" min="1" autofocus />
 	<label for="runs">Test Runs</label>
-	<input id="runs" type="number" value="10" autofocus />
+	<input id="runs" type="number" value="100" min="1" autofocus />
 	<span>
 		<label for="isDebug">Debug?</label>
 		<input id="isDebug" type="checkbox" />
diff --git a/index.ts b/index.ts
index d571e7e..92d9644 100644
--- a/index.ts
+++ b/index.ts
@@ -16,7 +16,7 @@ type Data = {
 	signature?: string
 }
 
-const NanoNaCl = async (bytes: number[]): Promise<void> => {
+export const NanoNaCl = async (bytes: number[]): Promise<any> => {
 	let isReady = false
 	let wasm
 	let module
@@ -314,6 +314,129 @@ async function run (data: Record<string, string>): Promise<string> {
 	}
 }
 
+let isReady = false
+let wasm
+let module
+let instance
+let memory: WebAssembly.Memory
+let deriveSync: (k: Uint8Array) => Uint8Array<ArrayBuffer>
+let signSync: (h: Uint8Array, k: Uint8Array) => Uint8Array<ArrayBuffer>
+let verifySync: (s: Uint8Array, h: Uint8Array, k: Uint8Array) => Uint8Array<ArrayBuffer>
+try {
+	wasm = Uint8Array.from(nacl)
+	module = await WebAssembly.compile(wasm)
+	instance = await WebAssembly.instantiate(module, {
+		env: {
+			abort: (msg: any, file: any, row: any, col: any) => {
+				// ~lib/builtins/abort(~lib/string/String | null?, ~lib/string/String | null?, u32?, u32?) => void
+				// msg = __liftString(msg >>> 0)
+				// file = __liftString(file >>> 0)
+				row = row >>> 0
+				col = col >>> 0
+				console.error('wasm abort:', `msg ${msg}`, `file ${file}`, `row ${row}`, `col ${col}`)
+				throw new Error(msg, { cause: { file, row, col } })
+			},
+			"performance.now" () {
+				// ~lib/bindings/dom/performance.now() => f64
+				return performance.now()
+			},
+			trace: (message: any, n?: number, a0?: number, a1?: number, a2?: number, a3?: number, a4?: number) => {
+				// ~lib/builtins/trace(~lib/string/String, i32?, f64?, f64?, f64?, f64?, f64?) => void
+				message = __liftString(message >>> 0);
+				(() => {
+					// @external.js
+					console.log(message, ...[a0, a1, a2, a3, a4].slice(0, n))
+				})()
+			},
+			memory: new WebAssembly.Memory({ initial: 4, maximum: 4 })
+		}
+	})
+	const { exports } = instance as { exports: { derive: Derive, sign: Sign, verify: Verify, getInputPointer: GetInputPointer, getOutputPointer: GetOutputPointer, memory: WebAssembly.Memory } }
+	memory = exports.memory
+
+	function __liftString (pointer: number) {
+		if (!pointer) return null
+		const
+			//@ts-ignore
+			end = pointer + new Uint32Array(memory.buffer)[pointer - 4 >>> 2] >>> 1,
+			//@ts-ignore
+			memoryU16 = new Uint16Array(memory.buffer)
+		let
+			start = pointer >>> 1,
+			string = ""
+		while (end - start > 1024) string += String.fromCharCode(...memoryU16.subarray(start, start += 1024))
+		return string + String.fromCharCode(...memoryU16.subarray(start, end))
+	}
+
+	deriveSync = function (k: Uint8Array): Uint8Array<ArrayBuffer> {
+		// assembly/nano-nacl/derive() => void
+		let buffer: DataView | undefined = new DataView(memory.buffer)
+		let inPtr = exports.getInputPointer()
+		for (let i = 0; i < 32; i++) {
+			buffer.setUint8(inPtr + i, k[i])
+		}
+		exports.derive()
+		const outPtr = exports.getOutputPointer()
+		const pk = new Uint8Array(32)
+		buffer = new DataView(memory.buffer)
+		for (let i = 0; i < 32; i++) {
+			pk[i] = buffer.getUint8(outPtr + i)
+		}
+		buffer = undefined
+		return pk
+	}
+
+	signSync = function (h: Uint8Array, k: Uint8Array): Uint8Array<ArrayBuffer> {
+		// assembly/nano-nacl/sign() => void
+		let buffer: DataView | undefined = new DataView(memory.buffer)
+		let inPtr = exports.getInputPointer()
+		for (let i = 0; i < 32; i++) {
+			buffer.setUint8(inPtr + i, h[i])
+		}
+		inPtr += 32
+		for (let i = 0; i < 64; i++) {
+			buffer.setUint8(inPtr + i, k[i])
+		}
+		exports.sign()
+		const outPtr = exports.getOutputPointer()
+		const s = new Uint8Array(64)
+		buffer = new DataView(memory.buffer)
+		for (let i = 0; i < 64; i++) {
+			s[i] = buffer.getUint8(outPtr + i)
+		}
+		buffer = undefined
+		return s
+	}
+
+	verifySync = function (s: Uint8Array, h: Uint8Array, k: Uint8Array): Uint8Array<ArrayBuffer> {
+		// assembly/nano-nacl/verify() => void
+		let buffer: DataView | undefined = new DataView(memory.buffer)
+		let inPtr = exports.getInputPointer()
+		for (let i = 0; i < 64; i++) {
+			buffer.setUint8(inPtr + i, s[i])
+		}
+		inPtr += 64
+		for (let i = 0; i < 32; i++) {
+			buffer.setUint8(inPtr + i, h[i])
+		}
+		inPtr += 32
+		for (let i = 0; i < 32; i++) {
+			buffer.setUint8(inPtr + i, k[i])
+		}
+		exports.verify()
+		const outPtr = exports.getOutputPointer()
+		const v = new Uint8Array(1)
+		buffer = new DataView(memory.buffer)
+		v[0] = buffer.getUint8(outPtr)
+		buffer = undefined
+		return v
+	}
+	isReady = true
+} catch (err) {
+	throw new Error('Error instantiating WebAssembly', { cause: err })
+}
+export { deriveSync, signSync, verifySync }
+
 /**
 * Nano public key derivation using WebAssembly.
 */
-- 
2.52.0