From a3679ae459cf71b50b2c7f965be2d82fbab65d8a Mon Sep 17 00:00:00 2001
From: Chris Duncan
Date: Thu, 26 Jun 2025 17:54:38 -0700
Subject: [PATCH] Refactor BLAKE2b codegen. Combine API-specific blake code
 generation files into one abstracted script. Reset hashes between score
 benchmark runs so it doesn't pull from cache.

---
 asconfig.json | 2 +-
 docs/blake2.md | 26 +
 package.json | 11 +-
 scripts/blake2b-gen.js | 408 ++++
 src/bin/cli.ts | 17 +-
 src/lib/generate/wasm/asm/compute.d.ts | 13 +
 src/lib/generate/wasm/asm/compute.js | 28 +
 src/lib/generate/wasm/asm/compute.wasm | Bin 0 -> 13246 bytes
 src/lib/generate/wasm/asm/generate.ts | 173 --
 src/lib/generate/wasm/asm/index.ts | 1380 ++++++++++++
 src/lib/generate/wasm/worker.ts | 2 +-
 src/lib/generate/webgl/shaders/draw.frag | 1832 ++++++++++++++++
 src/lib/generate/webgl/shaders/generate.ts | 241 ---
 src/lib/generate/webgl/shaders/index.ts | 2 +-
 src/lib/generate/webgl/shaders/tsconfig.json | 11 -
 .../webgl/shaders/tsconfig.json.license | 2 -
 src/lib/generate/webgpu/index.ts | 2 +-
 src/lib/generate/webgpu/shaders/compute.wgsl | 1857 +++++++++++++++++
 src/lib/generate/webgpu/shaders/generate.ts | 270 ---
 src/lib/generate/webgpu/shaders/tsconfig.json | 11 -
 .../webgpu/shaders/tsconfig.json.license | 2 -
 test/index.html | 4 +-
 22 files changed, 5566 insertions(+), 728 deletions(-)
 create mode 100644 docs/blake2.md
 create mode 100644 scripts/blake2b-gen.js
 create mode 100644 src/lib/generate/wasm/asm/compute.d.ts
 create mode 100644 src/lib/generate/wasm/asm/compute.js
 create mode 100644 src/lib/generate/wasm/asm/compute.wasm
 delete mode 100644 src/lib/generate/wasm/asm/generate.ts
 create mode 100644 src/lib/generate/wasm/asm/index.ts
 create mode 100644 src/lib/generate/webgl/shaders/draw.frag
 delete mode 100644 src/lib/generate/webgl/shaders/generate.ts
 delete mode 100644 src/lib/generate/webgl/shaders/tsconfig.json
 delete mode 100644 src/lib/generate/webgl/shaders/tsconfig.json.license
 create mode 100644 src/lib/generate/webgpu/shaders/compute.wgsl
 delete mode 100644 src/lib/generate/webgpu/shaders/generate.ts
 delete mode 100644 src/lib/generate/webgpu/shaders/tsconfig.json
 delete mode 100644 src/lib/generate/webgpu/shaders/tsconfig.json.license

diff --git a/asconfig.json b/asconfig.json
index 002045b..d96c7bc 100644
--- a/asconfig.json
+++ b/asconfig.json
@@ -1,6 +1,6 @@
 {
   "options": {
-    "outFile": "./src/lib/generate/wasm/asm/build/compute.wasm",
+    "outFile": "./src/lib/generate/wasm/asm/compute.wasm",
     "optimizeLevel": 3,
     "shrinkLevel": 2,
     "converge": true,
diff --git a/docs/blake2.md b/docs/blake2.md
new file mode 100644
index 0000000..e1be4b7
--- /dev/null
+++ b/docs/blake2.md
@@ -0,0 +1,26 @@
+The BLAKE2b compression step runs twelve rounds of G mixing, each divided into eight subprocesses. Each subprocess applies transformations to the `m` and `v` variables based on a defined set of index inputs. The algorithm for each subprocess is defined as follows:
+
+* r is the current round
+* i is the current subprocess within that round
+* a, b, c, d are elements of `v` at specific indexes
+* sigma is a defined set of array indexes for `m`
+* rotr64 is a right-hand bit rotation function
+
+```
+a = a + b
+a = a + m[sigma[r][2*i+0]]
+d = rotr64(d ^ a, 32)
+c = c + d
+b = rotr64(b ^ c, 24)
+a = a + b
+a = a + m[sigma[r][2*i+1]]
+d = rotr64(d ^ a, 16)
+c = c + d
+b = rotr64(b ^ c, 63)
+```
+
+In the 32-bit GLSL and WGSL implementations, each sum step has an extra carry addition to propagate overflow from the low half into the high half; a sketch of this follows. Note that the m[sigma] sum is skipped when m[sigma] is zero, since it effectively does nothing.
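+
+A minimal TypeScript sketch of that carry-emulated addition, assuming the shader convention of storing a u64 as two 32-bit halves (low half in `x`, high half in `y`); the `U64Pair` and `add64` names are illustrative only, not part of the codebase:
+
+```ts
+// A u64 split into 32-bit halves, mirroring the uvec2 layout used by the shaders.
+type U64Pair = { x: number, y: number }
+
+// 64-bit addition emulated with 32-bit halves: add each half, then carry one
+// into the high half if the low half wrapped around.
+function add64 (a: U64Pair, b: U64Pair): U64Pair {
+  const x = (a.x + b.x) >>> 0          // low half, wrapped to 32 bits
+  const carry = x < b.x ? 1 : 0        // a wrap occurred iff the sum is below an addend
+  const y = (a.y + b.y + carry) >>> 0  // high half plus carry
+  return { x, y }
+}
+```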
+Also note that rotations must be applied differently from the reference implementation due to the lack of both a native rotate function and 64-bit support in WGSL.
+
+Subprocesses 1-4 are entirely independent of each other, as are subprocesses 5-8, so an opportunity to compute in parallel presents itself. However, testing showed this to be ineffective at improving performance.
+
+These subprocesses could also be parallelized by pairing the variables of two different hash processes and packing them into vectors (i.e., packing two uvec2 values, each holding the high and low bits of a u64, into a single uvec4). Think of it as a two-lane highway where traffic in one lane has entirely different cars than the other, but both are travelling toward the same destination. Once again, however, testing showed this to have a negligible effect.
diff --git a/package.json b/package.json
index abb2f95..ae86604 100644
--- a/package.json
+++ b/package.json
@@ -47,17 +47,14 @@
     "url": "git+https://zoso.dev/nano-pow.git"
   },
   "scripts": {
-    "asgenerate": "cd src/lib/generate/wasm/asm && rm -rf build && tsc && node build/generate.js > build/index.ts && cp tsconfig.json* build",
     "benchmark": "npm run build && ./dist/bin/nano-pow.sh --benchmark 10 --debug",
-    "build": "rm -rf {dist,types} && tsc && npm run generate && asc src/lib/generate/wasm/asm/build/index.ts && node esbuild.mjs && cp -p src/bin/nano-pow.sh dist/bin",
+    "build": "rm -rf {dist,types} && tsc && npm run generate && node esbuild.mjs && cp -p src/bin/nano-pow.sh dist/bin",
     "build:dev": "NODE_ENV=development npm run build",
-    "generate": "npm run asgenerate && npm run glgenerate && npm run gpugenerate",
-    "glgenerate": "cd src/lib/generate/webgl/shaders && rm -rf build && tsc && node build/generate.js > build/draw.frag && cp tsconfig.json* build",
-    "gpugenerate": "cd src/lib/generate/webgpu/shaders && rm -rf build && tsc && node build/generate.js > build/compute.wgsl && cp tsconfig.json* build",
+    "generate": "node ./scripts/blake2b-gen.js && asc src/lib/generate/wasm/asm/index.ts",
     "prepare": "npm run build",
-    "score": "npm run build && ./dist/bin/nano-pow.sh --effort 4 --benchmark 100 --score 100",
+    "score": "npm run build && ./dist/bin/nano-pow.sh --effort 4 --benchmark 10 --score 10",
     "start": "node --max-http-header-size=1024 --max-old-space-size=256 ./dist/bin/server.js",
-    "test": "npm run build && ./test/script.sh"
+    "test": "npm run build:dev && ./test/script.sh"
   },
   "devDependencies": {
     "@types/node": "^24.0.3",
diff --git a/scripts/blake2b-gen.js b/scripts/blake2b-gen.js
new file mode 100644
index 0000000..2da80fd
--- /dev/null
+++ b/scripts/blake2b-gen.js
@@ -0,0 +1,408 @@
+//! SPDX-FileCopyrightText: 2025 Chris Duncan
+//! SPDX-License-Identifier: GPL-3.0-or-later
+
+import { writeFile } from 'node:fs/promises'
+
+function generate (api) {
+
+	if (api !== 'wasm' && api !== 'webgl' && api !== 'webgpu') {
+		throw new TypeError(`Cannot generate BLAKE2b code for ${api}`)
+	}
+
+	// Shorthand for adding two operands and storing the sum in the first.
+	function add (a, b) {
+		switch (api) {
+			case 'wasm': {
+				return `
+					${a} = unchecked(v128.add<i64>(${a}, ${b}));
+				`
+			}
+			case 'webgl': {
+				return `
+					${a} += ${b};
+					${a}.y += uint(${a}.x < ${b}.x);
+				`
+			}
+			case 'webgpu': {
+				return `
+					${a} += ${b};
+					${a}.y += u32(${a}.x < ${b}.x);
+				`
+			}
+		}
+	}
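+
+	// For illustration, add('v0', 'v4') expands to roughly the following for
+	// each API (whitespace trimmed):
+	//   wasm:   v0 = unchecked(v128.add<i64>(v0, v4));
+	//   webgl:  v0 += v4; v0.y += uint(v0.x < v4.x);
+	//   webgpu: v0 += v4; v0.y += u32(v0.x < v4.x);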
+
+	// Shorthand for how values are constructed.
+	function ctr (lo, hi) {
+		switch (api) {
+			case 'wasm': {
+				if (hi) return `v128.splat<i64>((${hi} << 32) | ${lo})`
+				return `v128.splat<i64>(${lo})`
+			}
+			case 'webgl': {
+				if (hi) return `uvec2(${lo}u, ${hi}u)`
+				return `uvec2(${lo}u)`
+			}
+			case 'webgpu': {
+				if (hi) return `vec2(${lo}u, ${hi}u)`
+				return `vec2(${lo}u)`
+			}
+		}
+	}
+
+	// Shorthand for a declaration statement.
+	const declare = (() => {
+		switch (api) {
+			case 'wasm': return 'let'
+			case 'webgl': return 'uvec2'
+			case 'webgpu': return 'var'
+		}
+	})()
+
+	// Shorthand for right-hand bit rotation of a 64-bit value.
+	function rotr (v, i) {
+		switch (api) {
+			case 'wasm': {
+				const shift = i / 8
+				if (Number.isInteger(shift)) {
+					const s0 = ['0', '1', '2', '3', '4', '5', '6', '7']
+					const s1 = ['8', '9', '10', '11', '12', '13', '14', '15']
+					for (let i = 0; i < shift; i++) {
+						s0.push(s0.shift() ?? '')
+						s1.push(s1.shift() ?? '')
+					}
+					return `${v} = v128.shuffle<i8>(${v}, ${v}, ${s0.join(',')}, ${s1.join(',')});`
+				} else {
+					return `${v} = v128.or(v128.shr<u64>(${v}, ${i}), v128.shl<i64>(${v}, ${64 - i}));`
+				}
+			}
+			case 'webgl': {
+				if (i === 32) {
+					return `${v} = ${v}.yx;`
+				} else if (i > 32) {
+					return `${v} = (${v} >> uvec2(${i - 32}u)).yx | (${v} << uvec2(${64 - i}u));`
+				} else {
+					return `${v} = (${v} >> uvec2(${i}u)) | (${v} << uvec2(${32 - i}u)).yx;`
+				}
+			}
+			case 'webgpu': {
+				if (i === 32) {
+					return `${v} = ${v}.yx;`
+				} else if (i > 32) {
+					return `${v} = (${v} >> vec2(${i - 32}u)).yx | (${v} << vec2(${64 - i}u));`
+				} else {
+					return `${v} = (${v} >> vec2(${i}u)) | (${v} << vec2(${32 - i}u)).yx;`
+				}
+			}
+		}
+	}
+
+	// Shorthand for XOR'ing two operands and saving the result in the first.
+	function xor (a, b) {
+		switch (api) {
+			case 'wasm': return `${a} = v128.xor(${a}, ${b});`
+			case 'webgl': return `${a} ^= ${b};`
+			case 'webgpu': return `${a} ^= ${b};`
+		}
+	}
+
+	const blake2b_state = [
+		[0, 4, 8, 12],
+		[1, 5, 9, 13],
+		[2, 6, 10, 14],
+		[3, 7, 11, 15],
+		[0, 5, 10, 15],
+		[1, 6, 11, 12],
+		[2, 7, 8, 13],
+		[3, 4, 9, 14]
+	]
+
+	const blake2b_sigma = [
+		[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+		[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+		[11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
+		[7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
+		[9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
+		[2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
+		[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
+		[13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
+		[6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
+		[10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
+		[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+		[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3]
+	]
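+
+	// For illustration, rotr('v4', 24) expands to roughly the following for
+	// each API (byte-aligned SIMD rotations become lane shuffles):
+	//   wasm:   v4 = v128.shuffle<i8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+	//   webgl:  v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+	//   webgpu: v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx;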
+
+	// Initialization vector defined by BLAKE2.
+	// Application of each XOR is defined by the BLAKE2 section 2.4 compression
+	// function. Each value represents the two halves of the original u64 value
+	// from the reference implementation. They appear reversed pairwise in order
+	// to align with little-endian computation.
+	const blake2b_iv = [
+		ctr(`0xf3bcc908`, `0x6a09e667`),
+		ctr(`0x84caa73b`, `0xbb67ae85`),
+		ctr(`0xfe94f82b`, `0x3c6ef372`),
+		ctr(`0x5f1d36f1`, `0xa54ff53a`),
+		ctr(`0xade682d1`, `0x510e527f`),
+		ctr(`0x2b3e6c1f`, `0x9b05688c`),
+		ctr(`0xfb41bd6b`, `0x1f83d9ab`),
+		ctr(`0x137e2179`, `0x5be0cd19`)
+	]
+
+	// Parameter block as defined in BLAKE2 section 2.8 and configured as follows:
+	// maximal depth = 1, fanout = 1, digest byte length = 8
+	const blake2b_param = ctr(`0x01010008`, `0x0`)
+
+	// Message input length, which is always 40 for Nano:
+	// 8 nonce bytes + 32 block hash bytes.
+	const blake2b_inlen = ctr(`0x28`, `0x0`)
+
+	// Finalization flag as defined in BLAKE2 section 2.4, set to ~0 since this
+	// is the final (and only) message block being hashed.
+	const blake2b_final = ctr(`~0x0`)
+
+	function G (a, b, c, d, x, y) {
+		return `
+			${add(`v${a}`, `v${b}`)}
+			${x > 4 ? '// NOP' : add(`v${a}`, `m${x}`)}
+			${xor(`v${d}`, `v${a}`)}
+			${rotr(`v${d}`, 32)}
+			${add(`v${c}`, `v${d}`)}
+			${xor(`v${b}`, `v${c}`)}
+			${rotr(`v${b}`, 24)}
+			${add(`v${a}`, `v${b}`)}
+			${y > 4 ? '// NOP' : add(`v${a}`, `m${y}`)}
+			${xor(`v${d}`, `v${a}`)}
+			${rotr(`v${d}`, 16)}
+			${add(`v${c}`, `v${d}`)}
+			${xor(`v${b}`, `v${c}`)}
+			${rotr(`v${b}`, 63)}
+		`
+	}
+
+	function ROUND (r) {
+		let output = `
+			// ROUND ${r}
+		`
+		for (let i = 0; i < 8; i++) {
+			const [a, b, c, d] = blake2b_state[i]
+			const s = blake2b_sigma[r]
+			output += r === 11 && (i === 5 || i === 7)
+				? `// G NOP`
+				: G(a, b, c, d, s[2 * i], s[2 * i + 1])
+		}
+		return output
+	}
+
+	function INIT () {
+		let output = `
+			// INITIALIZE STATE VECTOR
+			// v0: depth=1; fanout=1; outlen=8
+			// v12: input byte length
+			// v14: final block flag
+		`
+		for (let i = 0; i < 8; i++) {
+			output += `
+				${declare} v${i} = ${blake2b_iv[i]};
+				${declare} v${i + 8} = ${blake2b_iv[i]};
+			`
+		}
+		return output += `
+			${xor(`v0`, blake2b_param)}
+			${xor(`v12`, blake2b_inlen)}
+			${xor(`v14`, blake2b_final)}
+		`
+	}
+
+	function COMPRESS () {
+		let output = `
+			${INIT()}
+			// COMPRESS
+		`
+		for (let r = 0; r < 12; r++) {
+			output += ROUND(r)
+		}
+		return output.replace(/\t{2,}/g, '\t')
+	}
+
+	const main = (() => {
+		switch (api) {
+			case 'wasm': {
+				return `export function main (seed: u64, h0: u64, h1: u64, h2: u64, h3: u64, difficulty: u64): u64 {
+	${declare} m0 = ${ctr(`0`)};
+	const m1 = ${ctr(`h0`)};
+	const m2 = ${ctr(`h1`)};
+	const m3 = ${ctr(`h2`)};
+	const m4 = ${ctr(`h3`)};
+	${declare} r0: u64 = 0
+	${declare} r1: u64 = 0
+	${declare} result = ${ctr(`0`)};
+	const blake2b_iv_param = v128.xor(${blake2b_iv[0]}, ${blake2b_param});
+	const iterations: u64 = 1 << 24;
+
+	for (let i: u64 = 0; i < iterations; i++) {
+		m0 = i64x2(unchecked(seed + i), unchecked(seed + i + 1));
+		i += 2
+		${COMPRESS()}
+
+		// RESULT
+		result = v128.xor(blake2b_iv_param, v128.xor(v0, v8));
+		r0 = v128.extract_lane<u64>(result, 0);
+		r1 = v128.extract_lane<u64>(result, 1);
+		if (r0 >= difficulty || r1 >= difficulty) break
+	}
+
+	// Throw a trap if no valid nonce was found, else get the result from the SIMD lane.
+	if (r0 < difficulty && r1 < difficulty) {
+		return 1 / 0
+	}
+	return select(v128.extract_lane<u64>(m0, 0), v128.extract_lane<u64>(m0, 1), r0 >= difficulty);
+}
+`
+			}
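+			// Note on the wasm template above: each loop iteration packs two
+			// candidate nonces into the two i64 lanes of m0 (seed + i and
+			// seed + i + 1), hashes both at once, and select() returns whichever
+			// lane met the difficulty threshold.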
+			case 'webgl': {
+				return `#version 300 es
+#pragma vscode_glsllint_stage: frag
+//! SPDX-FileCopyrightText: 2025 Chris Duncan
+//! SPDX-FileContributor: Ben Green
+//! SPDX-License-Identifier: GPL-3.0-or-later AND MIT
+
+// hash - Array of 32-bit integers comprising a 32-byte Nano block hash
+// difficulty - Minimum threshold for the BLAKE2b result for work to be valid
+// seed - Random value which is uniquely varied by pixel coordinates
+layout(std140) uniform INPUT {
+	uint hash[8];
+	uvec2 difficulty;
+	uvec2 seed;
+};
+
+// work - Pixel value output if and only if a valid nonce is found
+out uvec4 work;
+
+// Main draw function
+//
+// Draws a single pixel per shader invocation, multiplied by the dimensions of
+// the canvas.
+//
+// Each component of a random 8-byte value, provided by the INPUT as a uvec2,
+// is XOR'd with the 2-D coordinates of the pixel on the canvas to create a
+// unique nonce value for each.
+//
+// Where the reference implementation uses array lookups, the NanoPow
+// implementation assigns each array element to its own variable to enhance
+// performance, but the variable name still contains the original index digit.
+void main() {
+	// Initialize fragment output
+	work = uvec4(0u);
+
+	// Initialize unique nonce and block hash
+	uvec2 m0 = seed ^ uvec2(gl_FragCoord);
+	uvec2 m1 = uvec2(hash[0u], hash[1u]);
+	uvec2 m2 = uvec2(hash[2u], hash[3u]);
+	uvec2 m3 = uvec2(hash[4u], hash[5u]);
+	uvec2 m4 = uvec2(hash[6u], hash[7u]);
+
+	${COMPRESS()}
+
+	// Set the pixel value if it exceeds the difficulty threshold, else discard it.
+	uvec2 result = ${blake2b_iv[0]} ^ ${blake2b_param} ^ v0 ^ v8;
+	if (result.y > difficulty.y || (result.y == difficulty.y && result.x >= difficulty.x)) {
+		work = uvec4(m0, result);
+	}
+	if (work.x == 0u) {
+		discard;
+	}
+}
+`
+			}
+			case 'webgpu': {
+				return `// Input buffers
+struct INPUT {
+	hash: array<vec4<u32>, 2>,
+	difficulty: vec2<u32>,
+	seed: vec2<u32>
+};
+@group(0) @binding(0) var<uniform> input: INPUT;
+
+// Output buffers
+struct OUTPUT {
+	found: atomic<u32>,
+	work: vec2<u32>,
+	difficulty: vec2<u32>
+};
+@group(0) @binding(1) var<storage, read_write> output: OUTPUT;
+
+// Shared flag to prevent execution for all workgroup threads based on the
+// atomicLoad() result of a single member thread.
+var<workgroup> found: bool;
+
+// Shared memory for hash, difficulty, and seed.
+var<workgroup> m1: vec2<u32>;
+var<workgroup> m2: vec2<u32>;
+var<workgroup> m3: vec2<u32>;
+var<workgroup> m4: vec2<u32>;
+var<workgroup> d: vec2<u32>;
+var<workgroup> seed: vec2<u32>;
+
+// Main compute function
+//
+// Computes with a workgroup size of 64, which balances warps between NVIDIA and
+// AMD cards while still considering the power-sensitive requirements of mobile
+// devices. The entire workgroup exits immediately if a nonce was already found
+// by a previous workgroup.
+//
+// Each component of a random 8-byte value, provided by the UBO as a vec2<u32>,
+// is XOR'd with a different dimensional index from the global thread identifier
+// to create a unique nonce value for each thread.
+//
+// Where the reference implementation uses array lookups, the NanoPow
+// implementation assigns each array element to its own variable to enhance
+// performance, but the variable name still contains the original index digit.
+
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
+	if (local_id.x == 0u) {
+		found = atomicLoad(&output.found) != 0u;
+		seed = input.seed;
+		m1 = input.hash[0u].xy;
+		m2 = input.hash[0u].zw;
+		m3 = input.hash[1u].xy;
+		m4 = input.hash[1u].zw;
+		d = input.difficulty;
+	}
+	workgroupBarrier();
+	if (found) { return; }
+
+	// Initialize unique nonce
+	let m0: vec2<u32> = seed ^ global_id.xy;
+
+	${COMPRESS()}
+
+	// Set the nonce if it exceeds the difficulty threshold and no other thread has set it.
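+	// (atomicCompareExchangeWeak may fail spuriously even when output.found is
+	// still 0u, hence the loop below: retry while the exchange fails and the
+	// old value is still 0u, and yield to the winning thread otherwise.)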
+	let result = ${blake2b_iv[0]} ^ ${blake2b_param} ^ v0 ^ v8;
+	if (result.y > input.difficulty.y || (result.y == input.difficulty.y && result.x >= input.difficulty.x)) {
+		loop {
+			let swap = atomicCompareExchangeWeak(&output.found, 0u, 1u);
+			if (swap.exchanged) {
+				output.work = m0;
+				output.difficulty = result;
+				break;
+			}
+			if (swap.old_value != 0u) {
+				break;
+			}
+		}
+		return;
+	}
+}
+`
+			}
+			default: return ''
+		}
+	})()
+
+	return main.replace(/[ \t]+\n/g, '\n').replace(/\n{2,}/g, '\n')
+}
+
+await writeFile('./src/lib/generate/wasm/asm/index.ts', generate('wasm'))
+await writeFile('./src/lib/generate/webgl/shaders/draw.frag', generate('webgl'))
+await writeFile('./src/lib/generate/webgpu/shaders/compute.wgsl', generate('webgpu'))
diff --git a/src/bin/cli.ts b/src/bin/cli.ts
index 2157c24..d1e8cd9 100755
--- a/src/bin/cli.ts
+++ b/src/bin/cli.ts
@@ -93,11 +93,7 @@ for (let i = 0; i < args.length; i++) {
 			if (b == null) throw new Error('Missing argument for benchmark')
 			const count = +b
 			if (count < 1) throw new Error('Invalid benchmark count')
-			const random = new Uint8Array(32)
-			while (hashes.length < count) {
-				getRandomValues(random)
-				hashes.push(Buffer.from(random).toString('hex'))
-			}
+			generateHashes(count)
 			isBenchmark = true
 			break
 		}
@@ -250,6 +246,14 @@ async function execute (): Promise<void> {
 	if (isBatch) console.log(results)
 }
 
+function generateHashes (count: number) {
+	const random = new Uint8Array(32)
+	while (hashes.length < count) {
+		getRandomValues(random)
+		hashes.push(Buffer.from(random).toString('hex'))
+	}
+}
+
 async function request (body: { [key: string]: any }): Promise<any> {
 	return new Promise((resolve, reject): void => {
 		const listener = async (msg: Serializable): Promise<void> => {
@@ -276,6 +280,9 @@ async function score (): Promise<void> {
 		const result = await benchmark()
 		logger.log(result)
 		if (result != null) rates.push(result.truncatedRate)
+		const count = hashes.length
+		hashes.splice(0, hashes.length)
+		generateHashes(count)
 	} catch (err) {
 		logger.log(err)
 	}
diff --git a/src/lib/generate/wasm/asm/compute.d.ts b/src/lib/generate/wasm/asm/compute.d.ts
new file mode 100644
index 0000000..02b5669
--- /dev/null
+++ b/src/lib/generate/wasm/asm/compute.d.ts
@@ -0,0 +1,13 @@
+/** Exported memory */
+export declare const memory: WebAssembly.Memory;
+/**
+ * src/lib/generate/wasm/asm/index/main
+ * @param seed `u64`
+ * @param h0 `u64`
+ * @param h1 `u64`
+ * @param h2 `u64`
+ * @param h3 `u64`
+ * @param difficulty `u64`
+ * @returns `u64`
+ */
+export declare function main(seed: bigint, h0: bigint, h1: bigint, h2: bigint, h3: bigint, difficulty: bigint): bigint;
diff --git a/src/lib/generate/wasm/asm/compute.js b/src/lib/generate/wasm/asm/compute.js
new file mode 100644
index 0000000..c0a5d23
--- /dev/null
+++ b/src/lib/generate/wasm/asm/compute.js
@@ -0,0 +1,28 @@
+async function instantiate(module, imports = {}) {
+  const { exports } = await WebAssembly.instantiate(module, imports);
+  const memory = exports.memory || imports.env.memory;
+  const adaptedExports = Object.setPrototypeOf({
+    main(seed, h0, h1, h2, h3, difficulty) {
+      // src/lib/generate/wasm/asm/index/main(u64, u64, u64, u64, u64, u64) => u64
+      seed = seed || 0n;
+      h0 = h0 || 0n;
+      h1 = h1 || 0n;
+      h2 = h2 || 0n;
+      h3 = h3 || 0n;
+      difficulty = difficulty || 0n;
+      return BigInt.asUintN(64, exports.main(seed, h0, h1, h2, h3, difficulty));
+    },
+  }, exports);
+  return adaptedExports;
+}
+export const {
+  memory,
+  main,
+} = await (async url => instantiate(
+  await (async () => {
+    const isNodeOrBun = typeof process
!= "undefined" && process.versions != null && (process.versions.node != null || process.versions.bun != null); + if (isNodeOrBun) { return globalThis.WebAssembly.compile(await (await import("node:fs/promises")).readFile(url)); } + else { return await globalThis.WebAssembly.compileStreaming(globalThis.fetch(url)); } + })(), { + } +))(new URL("compute.wasm", import.meta.url)); diff --git a/src/lib/generate/wasm/asm/compute.wasm b/src/lib/generate/wasm/asm/compute.wasm new file mode 100644 index 0000000000000000000000000000000000000000..dd51107d1e6f6cda7412870c1e0bc60ee8b63760 GIT binary patch literal 13246 zcmeHOO^6&t6z-mx{q35+p8uKM&Fel0l7mMDC2rP>;Kh~@G=dmJ*T0Pz@gQ#E;tzs& z@UV#BMN|-xNG_gYJb8&Bfm{S{p5o1eK`=;(yjN3QJM*f0W|+)mve`Xs!S>A8uU>uc ztM{tjIw#-QavjHUJ@?u9v-IPh&1PI@F6%nZLOGM$I{DHm$C=-He(SZzai*xA`xJeFI}tV?n2-RtMxd7=>MQHsS& zKi=59zVxylrMTJsj2@-9_Sn~7{P95Zf*!$vmF>`#KDZn1wp??=MS0Z?JsE`pgv{sW z<`*0{lNI?wadBzc3k5#T%P1^BLFTjF=U(@DSb#hT@Sq^Cx-j1ZmFa$Xx%=TgP)8q@ zS6nC@b>(GOJ{}aLc;({Jf4}?fy+eAG;_yGa|DC>Z>LEQ!vAgk8?$|xszw1$oPk;FJ z=Gv9Np4KBQpan%4g(WCiCR4)4Wf_G&_?F4|P=*o|!S6v~^DroC9=@K34`nI-^uIcD z=I>{|)FXIO9D4Q0;g4UO`&^GwJowEo?|<<9<^?_4K69e7biUl!I<7}p(*7+&*)qE_ zKCZ|ptU}c?yDC(m3?5W*sg`@7*c1(vnqsv(#Sp(7)?^gcp>8puj*lBqhZ)gV5~2runA3z8=Bw~N>h7)c+YGg-V=)QKvOSr zQ;PY{3qRlY$n`h%2ze+&k^ddvfT2@_qQzH5e2iplLCa!i3o#)NtsWF+r$JG6wzQr2 zeE~HnKm|%tRL>oG^nC8KT|H97hC&B6i+c>{up}eJH5T_6Cl98z(inx=VNe)J8{k8QR4KAA4s9wJh!v<-4s_Cf9I_NAU z9icEgXk#N3Wk*MANe8Mb+9?)=dMb1bkyI9RkTMEUkXE5g2pB#jxJ*$HMQuwl6f+Q^ zf-1D2Myw++n=!c1DXXTxk5f%H#-K1eNLX5+H31{cCM-q`m{Z8r`upjcVu+Z$YF%L| z?-iDmBq3c`G>W1v(uJUy7ieMa>e7KaRG^>;hwGUVyh!{uv@Nu4cTw6%+Zct}K@vbH ziVJN`+cs!nAP=jjJ<@L^fvwQwgoSdiSPj^VjT$86FGg4l3Ui792n%iT1BC@M4$KZ`U04i-QnrI3 z)rI!gxMZ>}-9*%7e+!G!&N$&DVbT`TdZf!i+LBtV_LWwQE|#Kb zR@B9)0b|Mt!eUU^+zraen$lOqFat-ZhsGBcvk&c?dT2ypi8qh2>!DfE6hknosu*|2 zsE3qx-olbE> zIUTu8XDqiSSus0Y@)bl-9eCYrQd5!y=B*J9T2xeweKn*+>Xsxhrmh}ZV$5E$Z_aW} z8f486?x41oS&^9iuP#QT7-#Uza0p>BLSWUDVBp~`He?AI1^&PuxV}t+y$=;X818&l zcT)GyIw(?Ct!`TzM1Zo4f?CfJstH17UR;Ybpke7y!&Z0fb8E<}XdeBKM8A2|g^Mk4 z>KM18yVEMvl1a+m-wfGp(+pIPHTEuJn|DH&p^|o74Q%zszFuE4BQU|lZgQFr?!PHL z>AsADiWJVJi+|z2m*|nY=CSX)WQV88_dBp_wyj|rP4bmd*x76Gfxf-&_b7qlL%HgN zt1x#0Jh|$IYcO}>bl!oLjqG-?A~K%m!5pj|hdEe1{(5eG!`*OpiXJS=RR@2$_~6No F_dlpU2eSYG literal 0 HcmV?d00001 diff --git a/src/lib/generate/wasm/asm/generate.ts b/src/lib/generate/wasm/asm/generate.ts deleted file mode 100644 index 7c34492..0000000 --- a/src/lib/generate/wasm/asm/generate.ts +++ /dev/null @@ -1,173 +0,0 @@ -//! SPDX-FileCopyrightText: 2025 Chris Duncan -//! SPDX-License-Identifier: GPL-3.0-or-later - -const blake2b_sigma: (0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15)[][] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], - [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4], - [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8], - [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13], - [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9], - [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], - [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10], - [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5], - [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0], - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3] -] - -/** -* Initialization vector defined by BLAKE2. 
-*/ -const blake2b_iv = [ - '0x6a09e667f3bcc908', - '0xbb67ae8584caa73b', - '0x3c6ef372fe94f82b', - '0xa54ff53a5f1d36f1', - '0x510e527fade682d1', - '0x9b05688c2b3e6c1f', - '0x1f83d9abfb41bd6b', - '0x5be0cd19137e2179' -] - -/** -* Parameter block as defined in BLAKE2 section 2.8 and configured as follows: -* maximal depth = 1, fanout = 1, digest byte length = 8 -*/ -const blake2b_param = `${blake2b_iv[0]} ^ 0x01010008` - -/** -* Message input length which is always 40 for Nano. -* 8 nonce bytes + 32 block hash bytes -*/ -const blake2b_inlen = `${blake2b_iv[4]} ^ 0x28` - -/** -* Finalization flag as defined in BLAKE2 section 2.4 and set to ~0 since this is -* the final (and only) message block being hashed. -*/ -const blake2b_final = `~${blake2b_iv[6]}` - -function G ( - a: 0 | 1 | 2 | 3, - b: 4 | 5 | 6 | 7, - c: 8 | 9 | 10 | 11, - d: 12 | 13 | 14 | 15, - x: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15, - y: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 -): string { - return ` - v${a} = unchecked(v128.add(v${a}, v${b})) - v${a} = unchecked(v128.add(v${a}, m${x})) - v${d} = v128.xor(v${d}, v${a}) - v${d} = v128.shuffle(v${d}, v${d}, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11) - v${c} = unchecked(v128.add(v${c}, v${d})) - v${b} = v128.xor(v${b}, v${c}) - v${b} = v128.shuffle(v${b}, v${b}, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10) - v${a} = unchecked(v128.add(v${a}, v${b})) - v${a} = unchecked(v128.add(v${a}, m${y})) - v${d} = v128.xor(v${d}, v${a}) - v${d} = v128.shuffle(v${d}, v${d}, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9) - v${c} = unchecked(v128.add(v${c}, v${d})) - v${b} = v128.xor(v${b}, v${c}) - v${b} = v128.or(v128.shr(v${b}, 63), v128.shl(v${b}, 1)) - ` -} - -function ROUND (i: (0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11)): string { - return ` - // ROUND ${i} - ${G(0, 4, 8, 12, blake2b_sigma[i][0], blake2b_sigma[i][1])} - ${G(1, 5, 9, 13, blake2b_sigma[i][2], blake2b_sigma[i][3])} - ${G(2, 6, 10, 14, blake2b_sigma[i][4], blake2b_sigma[i][5])} - ${G(3, 7, 11, 15, blake2b_sigma[i][6], blake2b_sigma[i][7])} - ${G(0, 5, 10, 15, blake2b_sigma[i][8], blake2b_sigma[i][9])} - ${G(1, 6, 11, 12, blake2b_sigma[i][10], blake2b_sigma[i][11])} - ${G(2, 7, 8, 13, blake2b_sigma[i][12], blake2b_sigma[i][13])} - ${G(3, 4, 9, 14, blake2b_sigma[i][14], blake2b_sigma[i][15])} - ` -} - -function SETUP (): string { - return ` - // input parameter configuration - // v0: depth=1; fanout=1; outlen=8 - // v12: inlen - // v14: final block flag - let v0 = v128.splat(${blake2b_param}) - let v1 = v128.splat(${blake2b_iv[1]}) - let v2 = v128.splat(${blake2b_iv[2]}) - let v3 = v128.splat(${blake2b_iv[3]}) - let v4 = v128.splat(${blake2b_iv[4]}) - let v5 = v128.splat(${blake2b_iv[5]}) - let v6 = v128.splat(${blake2b_iv[6]}) - let v7 = v128.splat(${blake2b_iv[7]}) - let v8 = v128.splat(${blake2b_iv[0]}) - let v9 = v128.splat(${blake2b_iv[1]}) - let v10 = v128.splat(${blake2b_iv[2]}) - let v11 = v128.splat(${blake2b_iv[3]}) - let v12 = v128.splat(${blake2b_inlen}) - let v13 = v128.splat(${blake2b_iv[5]}) - let v14 = v128.splat(${blake2b_final}) - let v15 = v128.splat(${blake2b_iv[7]}) - ` -} - -function hash (): string { - return ` - ${SETUP()} - ${ROUND(0)} - ${ROUND(1)} - ${ROUND(2)} - ${ROUND(3)} - ${ROUND(4)} - ${ROUND(5)} - ${ROUND(6)} - ${ROUND(7)} - ${ROUND(8)} - ${ROUND(9)} - ${ROUND(10)} - ${ROUND(11)} - ` -} - -console.log(`export function main (seed: u64, h0: u64, h1: u64, h2: u64, h3: u64, difficulty: u64): u64 { - let m0 = 
v128.splat(seed) - const m1 = v128.splat(h0) - const m2 = v128.splat(h1) - const m3 = v128.splat(h2) - const m4 = v128.splat(h3) - const m5 = v128.splat(0) - const m6 = v128.splat(0) - const m7 = v128.splat(0) - const m8 = v128.splat(0) - const m9 = v128.splat(0) - const m10 = v128.splat(0) - const m11 = v128.splat(0) - const m12 = v128.splat(0) - const m13 = v128.splat(0) - const m14 = v128.splat(0) - const m15 = v128.splat(0) - const finalizer = v128.splat(${blake2b_param}) - - let r0: u64 = 0 - let r1: u64 = 0 - let result = v128.splat(0) - const iterations: u64 = 1 << 24 - - for (let i: u64 = 0; i < iterations; i++) { - m0 = i64x2(unchecked(seed + i), unchecked(seed + i + 1)) - i += 2 - ${hash()} - result = v128.xor(finalizer, v128.xor(v0, v8)) - r0 = v128.extract_lane(result, 0) - r1 = v128.extract_lane(result, 1) - if (r0 >= difficulty || r1 >= difficulty) break - } - - if (r0 < difficulty && r1 < difficulty) { - return 1/0 - } - return select(v128.extract_lane(m0, 0), v128.extract_lane(m0, 1), r0 >= difficulty) -} -`) diff --git a/src/lib/generate/wasm/asm/index.ts b/src/lib/generate/wasm/asm/index.ts new file mode 100644 index 0000000..04f0d00 --- /dev/null +++ b/src/lib/generate/wasm/asm/index.ts @@ -0,0 +1,1380 @@ +export function main (seed: u64, h0: u64, h1: u64, h2: u64, h3: u64, difficulty: u64): u64 { + let m0 = v128.splat(0); + const m1 = v128.splat(h0); + const m2 = v128.splat(h1); + const m3 = v128.splat(h2); + const m4 = v128.splat(h3); + let r0: u64 = 0 + let r1: u64 = 0 + let result = v128.splat(0); + const blake2b_iv_param = v128.xor(v128.splat((0x6a09e667 << 32) | 0xf3bcc908), v128.splat((0x0 << 32) | 0x01010008)); + const iterations: u64 = 1 << 24; + for (let i: u64 = 0; i < iterations; i++) { + m0 = i64x2(unchecked(seed + i), unchecked(seed + i + 1)); + i += 2 + // INITIALIZE STATE VECTOR + // v0: depth=1; fanout=1; outlen=8 + // v12: input byte length + // v14: final block flag + let v0 = v128.splat((0x6a09e667 << 32) | 0xf3bcc908); + let v8 = v128.splat((0x6a09e667 << 32) | 0xf3bcc908); + let v1 = v128.splat((0xbb67ae85 << 32) | 0x84caa73b); + let v9 = v128.splat((0xbb67ae85 << 32) | 0x84caa73b); + let v2 = v128.splat((0x3c6ef372 << 32) | 0xfe94f82b); + let v10 = v128.splat((0x3c6ef372 << 32) | 0xfe94f82b); + let v3 = v128.splat((0xa54ff53a << 32) | 0x5f1d36f1); + let v11 = v128.splat((0xa54ff53a << 32) | 0x5f1d36f1); + let v4 = v128.splat((0x510e527f << 32) | 0xade682d1); + let v12 = v128.splat((0x510e527f << 32) | 0xade682d1); + let v5 = v128.splat((0x9b05688c << 32) | 0x2b3e6c1f); + let v13 = v128.splat((0x9b05688c << 32) | 0x2b3e6c1f); + let v6 = v128.splat((0x1f83d9ab << 32) | 0xfb41bd6b); + let v14 = v128.splat((0x1f83d9ab << 32) | 0xfb41bd6b); + let v7 = v128.splat((0x5be0cd19 << 32) | 0x137e2179); + let v15 = v128.splat((0x5be0cd19 << 32) | 0x137e2179); + v0 = v128.xor(v0, v128.splat((0x0 << 32) | 0x01010008)); + v12 = v128.xor(v12, v128.splat((0x0 << 32) | 0x28)); + v14 = v128.xor(v14, v128.splat(~0x0)); + // COMPRESS + // ROUND 0 + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m0)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m1)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, 
v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m2)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m3)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m4)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, 
v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 1 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m4)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m1)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + v1 = unchecked(v128.add(v1, m0)); + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + v1 = unchecked(v128.add(v1, m2)); + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + 
v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m3)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 2 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m0)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m2)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 
12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + v1 = unchecked(v128.add(v1, m3)); + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m1)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m4)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 3 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m3)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m1)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP 
+ v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m2)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m4)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m0)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 4 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m0)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = 
unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m2)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m4)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m1)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m3)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 
3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 5 + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m2)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m0)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + v3 = unchecked(v128.add(v3, m3)); + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m4)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, 
v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m1)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 6 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m1)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + v3 = unchecked(v128.add(v3, m4)); + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m0)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + 
v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + v1 = unchecked(v128.add(v1, m3)); + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m2)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 7 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m1)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 
10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + v3 = unchecked(v128.add(v3, m3)); + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m0)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + v1 = unchecked(v128.add(v1, m4)); + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m2)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 8 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = 
v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m3)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + v3 = unchecked(v128.add(v3, m0)); + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m2)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m1)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m4)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = 
unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 9 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m2)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m4)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + v3 = unchecked(v128.add(v3, m1)); + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 
1)); + v2 = unchecked(v128.add(v2, v7)); + v2 = unchecked(v128.add(v2, m3)); + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + v3 = unchecked(v128.add(v3, m0)); + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 10 + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m0)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + v0 = unchecked(v128.add(v0, m1)); + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m2)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m3)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + v2 = unchecked(v128.add(v2, m4)); + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 
12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v6)); + // NOP + v12 = v128.xor(v12, v1); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v12)); + v6 = v128.xor(v6, v11); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v4)); + // NOP + v14 = v128.xor(v14, v3); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v14)); + v4 = v128.xor(v4, v9); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + // ROUND 11 + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.shuffle(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v4)); + // NOP + v12 = v128.xor(v12, v0); + v12 = v128.shuffle(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v12)); + v4 = v128.xor(v4, v8); + v4 = v128.or(v128.shr(v4, 63), v128.shl(v4, 1)); + v1 = unchecked(v128.add(v1, v5)); + v1 = unchecked(v128.add(v1, m4)); + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v1 = unchecked(v128.add(v1, v5)); + // NOP + v13 = v128.xor(v13, v1); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v9 = unchecked(v128.add(v9, v13)); + v5 = v128.xor(v5, v9); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.shuffle(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v6)); + // NOP + v14 = v128.xor(v14, v2); + v14 = v128.shuffle(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = 
unchecked(v128.add(v10, v14)); + v6 = v128.xor(v6, v10); + v6 = v128.or(v128.shr(v6, 63), v128.shl(v6, 1)); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v3 = unchecked(v128.add(v3, v7)); + // NOP + v15 = v128.xor(v15, v3); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v11 = unchecked(v128.add(v11, v15)); + v7 = v128.xor(v7, v11); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + v0 = unchecked(v128.add(v0, v5)); + v0 = unchecked(v128.add(v0, m1)); + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.shuffle(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v0 = unchecked(v128.add(v0, v5)); + // NOP + v15 = v128.xor(v15, v0); + v15 = v128.shuffle(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v10 = unchecked(v128.add(v10, v15)); + v5 = v128.xor(v5, v10); + v5 = v128.or(v128.shr(v5, 63), v128.shl(v5, 1)); + // G NOP + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.shuffle(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10); + v2 = unchecked(v128.add(v2, v7)); + // NOP + v13 = v128.xor(v13, v2); + v13 = v128.shuffle(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9); + v8 = unchecked(v128.add(v8, v13)); + v7 = v128.xor(v7, v8); + v7 = v128.or(v128.shr(v7, 63), v128.shl(v7, 1)); + // G NOP + // RESULT + result = v128.xor(blake2b_iv_param, v128.xor(v0, v8)); + r0 = v128.extract_lane(result, 0); + r1 = v128.extract_lane(result, 1); + if (r0 >= difficulty || r1 >= difficulty) break + } + // Throw trap if no valid nonce found, else get result from SIMD lane. + if (r0 < difficulty && r1 < difficulty) { + return 1 / 0 + } + return select(v128.extract_lane(m0, 0), v128.extract_lane(m0, 1), r0 >= difficulty); +} diff --git a/src/lib/generate/wasm/worker.ts b/src/lib/generate/wasm/worker.ts index 2de0fce..4a1b229 100644 --- a/src/lib/generate/wasm/worker.ts +++ b/src/lib/generate/wasm/worker.ts @@ -2,7 +2,7 @@ //! SPDX-License-Identifier: GPL-3.0-or-later //@ts-expect-error -import compute from './asm/build/compute.wasm' +import compute from './asm/compute.wasm' type Main = (w: bigint, h0: bigint, h1: bigint, h2: bigint, h3: bigint, d: bigint) => any const worker = async (compute: number[]): Promise => { diff --git a/src/lib/generate/webgl/shaders/draw.frag b/src/lib/generate/webgl/shaders/draw.frag new file mode 100644 index 0000000..deb4009 --- /dev/null +++ b/src/lib/generate/webgl/shaders/draw.frag @@ -0,0 +1,1832 @@ +#version 300 es +#pragma vscode_glsllint_stage: frag +//! SPDX-FileCopyrightText: 2025 Chris Duncan +//! SPDX-FileContributor: Ben Green +//! 
SPDX-License-Identifier: GPL-3.0-or-later AND MIT + // hash - Array of 32-bit integers comprising a 32-byte Nano block hash + // difficulty - Minimum threshold for BLAKE2b result for work to be valid + // seed - Random value which is uniquely varied by pixel coordinates + layout(std140) uniform INPUT { + uint hash[8]; + uvec2 difficulty; + uvec2 seed; + }; + // work - Pixel value, output if and only if a valid nonce is found + out uvec4 work; + // Main draw function + // + // Draws a single pixel per shader invocation; the dimensions of the canvas + // determine the total number of invocations. + // + // Each component of a random 8-byte value, provided by the INPUT as a uvec2, + // is XOR'd with the 2-D coordinates of the pixel on the canvas to create a + // unique nonce value for each pixel. + // + // Where the reference implementation uses array lookups, the NanoPow + // implementation assigns each array element to its own variable to enhance + // performance, but the variable name still contains the original index digit. + void main() { + // Initialize fragment output + work = uvec4(0u); + // Initialize unique nonce and block hash + uvec2 m0 = seed ^ uvec2(gl_FragCoord); + uvec2 m1 = uvec2(hash[0u], hash[1u]); + uvec2 m2 = uvec2(hash[2u], hash[3u]); + uvec2 m3 = uvec2(hash[4u], hash[5u]); + uvec2 m4 = uvec2(hash[6u], hash[7u]); + // INITIALIZE STATE VECTOR + // v0: depth=1; fanout=1; outlen=8 + // v12: input byte length + // v14: final block flag + uvec2 v0 = uvec2(0xf3bcc908u, 0x6a09e667u); + uvec2 v8 = uvec2(0xf3bcc908u, 0x6a09e667u); + uvec2 v1 = uvec2(0x84caa73bu, 0xbb67ae85u); + uvec2 v9 = uvec2(0x84caa73bu, 0xbb67ae85u); + uvec2 v2 = uvec2(0xfe94f82bu, 0x3c6ef372u); + uvec2 v10 = uvec2(0xfe94f82bu, 0x3c6ef372u); + uvec2 v3 = uvec2(0x5f1d36f1u, 0xa54ff53au); + uvec2 v11 = uvec2(0x5f1d36f1u, 0xa54ff53au); + uvec2 v4 = uvec2(0xade682d1u, 0x510e527fu); + uvec2 v12 = uvec2(0xade682d1u, 0x510e527fu); + uvec2 v5 = uvec2(0x2b3e6c1fu, 0x9b05688cu); + uvec2 v13 = uvec2(0x2b3e6c1fu, 0x9b05688cu); + uvec2 v6 = uvec2(0xfb41bd6bu, 0x1f83d9abu); + uvec2 v14 = uvec2(0xfb41bd6bu, 0x1f83d9abu); + uvec2 v7 = uvec2(0x137e2179u, 0x5be0cd19u); + uvec2 v15 = uvec2(0x137e2179u, 0x5be0cd19u); + v0 ^= uvec2(0x01010008u, 0x0u); + v12 ^= uvec2(0x28u, 0x0u); + v14 ^= uvec2(~0x0u); + // COMPRESS + // ROUND 0 + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m0; + v0.y += uint(v0.x < m0.x); + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m1; + v0.y += uint(v0.x < m1.x); + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m2; + v1.y += uint(v1.x < m2.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m3; + v1.y += uint(v1.x < m3.x); + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m4; + v2.y += uint(v2.x < m4.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + 
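// Note: each uvec2 above emulates one u64 as (x = low word, y = high word). +
// A 64-bit add propagates its carry with `a.y += uint(a.x < b.x)`, the `.yx` +
// swizzle alone is a rotation by 32 bits, and the other rotations are built +
// from 32-bit shifts: `(v >> uvec2(24u)) | (v << uvec2(8u)).yx` is rotr64(v, 24), +
// the 16u/16u form is rotr64(v, 16), and `(v >> uvec2(31u)).yx | (v << uvec2(1u))` +
// is rotr64(v, 63). +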
v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 1 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m4; + v1.y += uint(v1.x < m4.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < 
v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m1; + v0.y += uint(v0.x < m1.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + v1 += m0; + v1.y += uint(v1.x < m0.x); + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + v1 += m2; + v1.y += uint(v1.x < m2.x); + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m3; + v3.y += uint(v3.x < m3.x); + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 2 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m0; + v1.y += uint(v1.x < m0.x); + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m2; + v2.y += uint(v2.x < m2.x); + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << 
uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + v1 += m3; + v1.y += uint(v1.x < m3.x); + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m1; + v2.y += uint(v2.x < m1.x); + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m4; + v3.y += uint(v3.x < m4.x); + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 3 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m3; + v1.y += uint(v1.x < m3.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m1; + v1.y += uint(v1.x < m1.x); + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> 
uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m2; + v0.y += uint(v0.x < m2.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m4; + v2.y += uint(v2.x < m4.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m0; + v2.y += uint(v2.x < m0.x); + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 4 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m0; + v0.y += uint(v0.x < m0.x); + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m2; + v2.y += uint(v2.x < m2.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m4; + v2.y += uint(v2.x < m4.x); + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += 
uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m1; + v0.y += uint(v0.x < m1.x); + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m3; + v3.y += uint(v3.x < m3.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 5 + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m2; + v0.y += uint(v0.x < m2.x); + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m0; + v2.y += uint(v2.x < m0.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + v3 += m3; + v3.y += uint(v3.x < m3.x); + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + 
v0.y += uint(v0.x < v5.x); + v0 += m4; + v0.y += uint(v0.x < m4.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m1; + v3.y += uint(v3.x < m1.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 6 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m1; + v1.y += uint(v1.x < m1.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + v3 += m4; + v3.y += uint(v3.x < m4.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m0; + v0.y += uint(v0.x < m0.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x 
< v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + v1 += m3; + v1.y += uint(v1.x < m3.x); + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m2; + v2.y += uint(v2.x < m2.x); + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 7 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m1; + v2.y += uint(v2.x < m1.x); + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + v3 += m3; + v3.y += uint(v3.x < m3.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m0; + v0.y 
+= uint(v0.x < m0.x); + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + v1 += m4; + v1.y += uint(v1.x < m4.x); + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m2; + v3.y += uint(v3.x < m2.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 8 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m3; + v2.y += uint(v2.x < m3.x); + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + v3 += m0; + v3.y += uint(v3.x < m0.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m2; + v0.y += uint(v0.x < m2.x); + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += 
v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m1; + v2.y += uint(v2.x < m1.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m4; + v2.y += uint(v2.x < m4.x); + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 9 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m2; + v0.y += uint(v0.x < m2.x); + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m4; + v1.y += uint(v1.x < m4.x); + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + v3 += m1; + v3.y += uint(v3.x < m1.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + 
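// Note: BLAKE2b defines only ten distinct message schedules, so the two +
// rounds that follow (ROUND 10 and ROUND 11) reuse the schedules of ROUND 0 +
// and ROUND 1; their m0 through m4 additions repeat the opening pattern. +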
v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + v2 += m3; + v2.y += uint(v2.x < m3.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + v3 += m0; + v3.y += uint(v3.x < m0.x); + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 10 + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m0; + v0.y += uint(v0.x < m0.x); + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + v0 += m1; + v0.y += uint(v0.x < m1.x); + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m2; + v1.y += uint(v1.x < m2.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m3; + v1.y += uint(v1.x < m3.x); + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + v2 += m4; + v2.y += uint(v2.x < m4.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v1 += v6; + v1.y += uint(v1.x < 
v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v1 += v6; + v1.y += uint(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v11 += v12; + v11.y += uint(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v3 += v4; + v3.y += uint(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v9 += v14; + v9.y += uint(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + // ROUND 11 + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx; + v0 += v4; + v0.y += uint(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx; + v8 += v12; + v8.y += uint(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u)); + v1 += v5; + v1.y += uint(v1.x < v5.x); + v1 += m4; + v1.y += uint(v1.x < m4.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v1 += v5; + v1.y += uint(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v9 += v13; + v9.y += uint(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx; + v2 += v6; + v2.y += uint(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx; + v10 += v14; + v10.y += uint(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u)); + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v3 += v7; + v3.y += uint(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v11 += v15; + v11.y += uint(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + v0 += v5; + v0.y += uint(v0.x < v5.x); + v0 += m1; + v0.y += uint(v0.x < m1.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx; + v0 += v5; + v0.y += uint(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx; + v10 += v15; + v10.y += uint(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u)); + // G NOP + v2 += v7; + v2.y += uint(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx; + v2 += v7; + v2.y += uint(v2.x < 
v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx; + v8 += v13; + v8.y += uint(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u)); + // G NOP + // Set pixel value if it exceeds difficulty threshold, else discard it. + uvec2 result = uvec2(0xf3bcc908u, 0x6a09e667u) ^ uvec2(0x01010008u, 0x0u) ^ v0 ^ v8; + if (result.y > difficulty.y || (result.y == difficulty.y && result.x >= difficulty.x)) { + work = uvec4(m0, result); + } + if (work.x == 0u) { + discard; + } +} diff --git a/src/lib/generate/webgl/shaders/generate.ts b/src/lib/generate/webgl/shaders/generate.ts deleted file mode 100644 index 6a12f2a..0000000 --- a/src/lib/generate/webgl/shaders/generate.ts +++ /dev/null @@ -1,241 +0,0 @@ -//! SPDX-FileCopyrightText: 2025 Chris Duncan -//! SPDX-License-Identifier: GPL-3.0-or-later - -const blake2b_state: number[][] = [ - [0, 4, 8, 12], - [1, 5, 9, 13], - [2, 6, 10, 14], - [3, 7, 11, 15], - [0, 5, 10, 15], - [1, 6, 11, 12], - [2, 7, 8, 13], - [3, 4, 9, 14] -] - -const blake2b_sigma: (0 | 1 | 2 | 3 | 4 | 'Z')[][] = [ - [0, 1, 2, 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'], - ['Z', 'Z', 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z', 0, 2, 'Z', 'Z', 'Z', 3], - ['Z', 'Z', 'Z', 0, 'Z', 2, 'Z', 'Z', 'Z', 'Z', 3, 'Z', 'Z', 1, 'Z', 4], - ['Z', 'Z', 3, 1, 'Z', 'Z', 'Z', 'Z', 2, 'Z', 'Z', 'Z', 4, 0, 'Z', 'Z'], - ['Z', 0, 'Z', 'Z', 2, 4, 'Z', 'Z', 'Z', 1, 'Z', 'Z', 'Z', 'Z', 3, 'Z'], - [2, 'Z', 'Z', 'Z', 0, 'Z', 'Z', 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z'], - ['Z', 'Z', 1, 'Z', 'Z', 'Z', 4, 'Z', 0, 'Z', 'Z', 3, 'Z', 2, 'Z', 'Z'], - ['Z', 'Z', 'Z', 'Z', 'Z', 1, 3, 'Z', 'Z', 0, 'Z', 4, 'Z', 'Z', 2, 'Z'], - ['Z', 'Z', 'Z', 'Z', 'Z', 3, 0, 'Z', 'Z', 2, 'Z', 'Z', 1, 4, 'Z', 'Z'], - ['Z', 2, 'Z', 4, 'Z', 'Z', 1, 'Z', 'Z', 'Z', 'Z', 'Z', 3, 'Z', 'Z', 0], - [0, 1, 2, 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'], - ['Z', 'Z', 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z', 0, 2, 'Z', 'Z', 'Z', 3] -] - -/** -* Initialization vector defined by BLAKE2. -* Application of each XOR is defined by BLAKE2 section 2.4 compression function. -* Each uvec2 represents two halves of the original u64 value from the reference -* implementation. They appear reversed pairwise as defined below, but this is an -* illusion due to endianness: the \`x\` component of the vector is the low bits -* and the \`y\` component is the high bits, and if you laid the bits out -* individually, they would match the little-endian 64-bit representation. -*/ -const blake2b_iv = [ - 'uvec2(0xF3BCC908u, 0x6A09E667u)', - 'uvec2(0x84CAA73Bu, 0xBB67AE85u)', - 'uvec2(0xFE94F82Bu, 0x3C6EF372u)', - 'uvec2(0x5F1D36F1u, 0xA54FF53Au)', - 'uvec2(0xADE682D1u, 0x510E527Fu)', - 'uvec2(0x2B3E6C1Fu, 0x9B05688Cu)', - 'uvec2(0xFB41BD6Bu, 0x1F83D9ABu)', - 'uvec2(0x137E2179u, 0x5BE0CD19u)' -] - -/** -* Parameter block as defined in BLAKE2 section 2.8 and configured as follows: -* maximal depth = 1, fanout = 1, digest byte length = 8 -*/ -const blake2b_param = `uvec2(0x01010008u, 0u)` - -function G (a: number, b: number, c: number, d: number, - x: 0 | 1 | 2 | 3 | 4 | 'Z', y: 0 | 1 | 2 | 3 | 4 | 'Z' -): string { - return ` -v${a} += v${b}; -v${a}.y += uint(v${a}.x < v${b}.x); - -${x === 'Z' ? `// NOP` : ` - v${a} += m${x}; - v${a}.y += uint(v${a}.x < m${x}.x);`} - -v${d} = (v${d} ^ v${a}).yx; - -v${c} += v${d}; -v${c}.y += uint(v${c}.x < v${d}.x); - -v${b} ^= v${c}; -v${b} = (v${b} >> uvec2(24u)) | (v${b} << uvec2(8u)).yx; - -v${a} += v${b}; -v${a}.y += uint(v${a}.x < v${b}.x); - -${y === 'Z' ? 
`// NOP` : ` - v${a} += m${y}; - v${a}.y += uint(v${a}.x < m${y}.x);`} - -v${d} ^= v${a}; -v${d} = (v${d} >> uvec2(16u)) | (v${d} << uvec2(16u)).yx; - -v${c} += v${d}; -v${c}.y += uint(v${c}.x < v${d}.x); - -v${b} ^= v${c}; -v${b} = (v${b} >> uvec2(31u)).yx | (v${b} << uvec2(1u)); -`} - -function ROUND (r: number): string { - let output = `// ROUND ${r}` - for (let i = 0; i < 8; i++) { - const [a, b, c, d] = blake2b_state[i] - const s = blake2b_sigma[r] - output += r === 11 && (i === 5 || i === 7) ? `// G NOP` : ` - ${G(a, b, c, d, s[2 * i], s[2 * i + 1])} - ` - } - return output -} - -function SETUP (): string { - let output = ` -// Initialize fragment output -work = uvec4(0u); - -// Initialize unique nonce -uvec2 m0 = seed ^ uvec2(gl_FragCoord); - -// Block hash -uvec2 m1 = uvec2(hash[0u], hash[1u]); -uvec2 m2 = uvec2(hash[2u], hash[3u]); -uvec2 m3 = uvec2(hash[4u], hash[5u]); -uvec2 m4 = uvec2(hash[6u], hash[7u]); - -// Initialize state vector configured for Nano -// v0: depth=1; fanout=1; outlen=8 -// v12: input byte length -// v14: final block flag -` - for (let i = 0; i < 8; i++) { - output += ` - uvec2 v${i} = ${blake2b_iv[i]}; - uvec2 v${i + 8} = ${blake2b_iv[i]}; - ` - } - output += ` - v0 ^= uvec2(0x01010008u, 0u); - v12 ^= uvec2(40u, 0u); - v14 = ~v14; - ` - return output -} - -function HASH () { - let output = ` -/** -* Twelve rounds of G mixing as part of BLAKE2b compression step, each divided -* into eight subprocesses, each of which is further paired to be processed in -* parallel by packing independent uvec2 variables into vec4 variables. -* Each subprocess statement execution is alternated so that the compiler can -* interleave independent instructions for improved scheduling. That is to say, -* the first statement \`a = a + b\` is executed for each subprocess, and then -* the next statement \`a = a + m[sigma[r][2*i+0]]\` is executed, and so on -* through all the steps of the G mix function. Once subprocesses 1-4 are done, -* computation on subprocesses 5-8 are executed in the same manner. -* -* Each subprocess applies transformations to \`m\` and \`v\` variables based on a -* defined set of index inputs. The algorithm for each subprocess is defined as -* follows: -* -* r is the current round -* i is the current subprocess within that round -* a, b, c, d are elements of \`v\` at specific indexes -* sigma is a defined set of array indexes for \`m\` -* rotr64 is a right-hand bit rotation function -* -* a = a + b -* a = a + m[sigma[r][2*i+0]] -* d = rotr64(d ^ a, 32) -* c = c + d -* b = rotr64(b ^ c, 24) -* a = a + b -* a = a + m[sigma[r][2*i+1]] -* d = rotr64(d ^ a, 16) -* c = c + d -* b = rotr64(b ^ c, 63) -* -* Each sum step has an extra carry addition. Note that the m[sigma] sum is -* skipped if m[sigma] is zero since it effectively does nothing. Also note -* that rotations must be applied differently from the reference implementation -* due to the lack of both a native rotate function and 64-bit support in WGSL. 
-*/ -` - for (let i = 0; i < 12; i++) output += ROUND(i) - return output -} - -function UNIFORMS () { - return ` -// hash - Array of 32-bit integers comprising a 32-byte Nano block hash -// difficulty - Minimum threshold for BLAKE2b result for work to be valid -// seed - Random value which is uniquely varied by pixel coordinates -layout(std140) uniform INPUT { - uint hash[8]; - uvec2 difficulty; - uvec2 seed; -}; - -// work - Pixel value output if and only if valid nonce is found -out uvec4 work; -`} - -/** Output to draw.frag using Node */ -console.log(`#version 300 es -#pragma vscode_glsllint_stage: frag -//! SPDX-FileCopyrightText: 2025 Chris Duncan -//! SPDX-FileContributor: Ben Green -//! SPDX-License-Identifier: GPL-3.0-or-later AND MIT - -#ifdef GL_FRAGMENT_PRECISION_HIGH -precision highp float; -#else -precision mediump float; -#endif -${UNIFORMS()} - -/** -* Main draw function -* -* Draws a single pixel per shader invocation, multiplied by the dimensions of -* the canvas. -* -* Each component of a random 8-byte value, provided by the INPUT as a uvec2, -* is XOR'd with the 2-D coordinates of the pixel on the canvas to create a -* unique nonce value for each. -* -* Where the reference implementation uses array lookups, the NanoPow -* implementation assigns each array element to its own variable to enhance -* performance, but the variable name still contains the original index digit. -*/ -void main() { - - ${SETUP()} - ${HASH()} - - // NONCE CHECK - // Set pixel value if it exceeds difficulty threshold, else discard it. - uvec2 result = ${blake2b_iv[0]} ^ ${blake2b_param} ^ v0 ^ v8; - if (result.y > difficulty.y || (result.y == difficulty.y && result.x >= difficulty.x)) { - work = uvec4(m0, result); - } - if (work.x == 0u) { - discard; - } -} -`) - -export { } diff --git a/src/lib/generate/webgl/shaders/index.ts b/src/lib/generate/webgl/shaders/index.ts index 86c0531..a780471 100644 --- a/src/lib/generate/webgl/shaders/index.ts +++ b/src/lib/generate/webgl/shaders/index.ts @@ -2,7 +2,7 @@ //! SPDX-License-Identifier: GPL-3.0-or-later import { default as downsampleSource } from './downsample.frag' -import { default as drawSource } from './build/draw.frag' +import { default as drawSource } from './draw.frag' import { default as quadSource } from './quad.vert' export { downsampleSource, drawSource, quadSource } diff --git a/src/lib/generate/webgl/shaders/tsconfig.json b/src/lib/generate/webgl/shaders/tsconfig.json deleted file mode 100644 index 482924f..0000000 --- a/src/lib/generate/webgl/shaders/tsconfig.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "include": [ - "./*.ts" - ], - "compilerOptions": { - "target": "ESNext", - "module": "ESNext", - "moduleResolution": "Bundler", - "outDir": "./build" - } -} diff --git a/src/lib/generate/webgl/shaders/tsconfig.json.license b/src/lib/generate/webgl/shaders/tsconfig.json.license deleted file mode 100644 index 7f56691..0000000 --- a/src/lib/generate/webgl/shaders/tsconfig.json.license +++ /dev/null @@ -1,2 +0,0 @@ -SPDX-FileCopyrightText: 2025 Chris Duncan -SPDX-License-Identifier: GPL-3.0-or-later diff --git a/src/lib/generate/webgpu/index.ts b/src/lib/generate/webgpu/index.ts index 654d1e4..75de037 100644 --- a/src/lib/generate/webgpu/index.ts +++ b/src/lib/generate/webgpu/index.ts @@ -1,7 +1,7 @@ //! SPDX-FileCopyrightText: 2025 Chris Duncan //! 
SPDX-License-Identifier: GPL-3.0-or-later
-import { default as NanoPowGpuComputeShader } from './shaders/build/compute.wgsl'
+import { default as NanoPowGpuComputeShader } from './shaders/compute.wgsl'
 import { WorkGenerateResponse } from '#types'
 import { bigintAsUintNArray, bigintRandom, bigintToHex, Logger, Queue } from '#utils'
diff --git a/src/lib/generate/webgpu/shaders/compute.wgsl b/src/lib/generate/webgpu/shaders/compute.wgsl
new file mode 100644
index 0000000..9701121
--- /dev/null
+++ b/src/lib/generate/webgpu/shaders/compute.wgsl
@@ -0,0 +1,1857 @@
+// Input buffers
+struct INPUT {
+ hash: array<vec4<u32>, 2>,
+ difficulty: vec2<u32>,
+ seed: vec2<u32>
+};
+@group(0) @binding(0) var<uniform> input: INPUT;
+// Output buffers
+struct OUTPUT {
+ found: atomic<u32>,
+ work: vec2<u32>,
+ difficulty: vec2<u32>
+};
+@group(0) @binding(1) var<storage, read_write> output: OUTPUT;
+// Shared flag to prevent execution for all workgroup threads based on the
+// atomicLoad() result of a single member thread.
+var<workgroup> found: bool;
+// Shared memory for hash, difficulty, and seed.
+var<workgroup> m1: vec2<u32>;
+var<workgroup> m2: vec2<u32>;
+var<workgroup> m3: vec2<u32>;
+var<workgroup> m4: vec2<u32>;
+var<workgroup> d: vec2<u32>;
+var<workgroup> seed: vec2<u32>;
+// Main compute function
+//
+// Computes with a workgroup size of 64 which balances warps between NVIDIA and
+// AMD cards while still considering the power-sensitive requirements of mobile
+// devices. The entire workgroup exits immediately if a nonce was already found
+// by a previous workgroup.
+//
+// Each component of a random 8-byte value, provided by the UBO as a vec2<u32>,
+// is XOR'd with a different dimensional index from the global thread identifier
+// to create a unique nonce value for each thread.
+//
+// Where the reference implementation uses array lookups, the NanoPow
+// implementation assigns each array element to its own variable to enhance
+// performance, but the variable name still contains the original index digit.
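+// As a reading aid, each unrolled round below applies the BLAKE2b G mix eight
+// times over 64-bit values split into x (low) and y (high) halves. One G
+// application, in sketch form with mx/my standing in for whichever of m0..m4
+// sigma selects (zero message words appear below as NOP comments):
+//   a += b (+ carry);  a += mx (+ carry);  d = (d ^ a).yx;              // rotr64(d ^ a, 32)
+//   c += d (+ carry);  b = ((b ^ c) >> 24) | ((b ^ c) << 8).yx;        // rotr64(b ^ c, 24)
+//   a += b (+ carry);  a += my (+ carry);
+//   d = ((d ^ a) >> 16) | ((d ^ a) << 16).yx;                          // rotr64(d ^ a, 16)
+//   c += d (+ carry);  b = ((b ^ c) >> 31).yx | ((b ^ c) << 1);        // rotr64(b ^ c, 63)
+// where "(+ carry)" is the explicit carry propagation written out in full
+// below as a.y += u32(a.x < b.x), and mx/my are placeholder names.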
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
+ if (local_id.x == 0u) {
+ found = atomicLoad(& output.found) != 0u;
+ seed = input.seed;
+ m1 = input.hash[0u].xy;
+ m2 = input.hash[0u].zw;
+ m3 = input.hash[1u].xy;
+ m4 = input.hash[1u].zw;
+ d = input.difficulty;
+ }
+ workgroupBarrier();
+ if (found) { return; }
+ // Initialize unique nonce
+ let m0: vec2<u32> = seed ^ global_id.xy;
+ // INITIALIZE STATE VECTOR
+ // v0: depth=1; fanout=1; outlen=8
+ // v12: input byte length
+ // v14: final block flag
+ var v0 = vec2(0xf3bcc908u, 0x6a09e667u);
+ var v8 = vec2(0xf3bcc908u, 0x6a09e667u);
+ var v1 = vec2(0x84caa73bu, 0xbb67ae85u);
+ var v9 = vec2(0x84caa73bu, 0xbb67ae85u);
+ var v2 = vec2(0xfe94f82bu, 0x3c6ef372u);
+ var v10 = vec2(0xfe94f82bu, 0x3c6ef372u);
+ var v3 = vec2(0x5f1d36f1u, 0xa54ff53au);
+ var v11 = vec2(0x5f1d36f1u, 0xa54ff53au);
+ var v4 = vec2(0xade682d1u, 0x510e527fu);
+ var v12 = vec2(0xade682d1u, 0x510e527fu);
+ var v5 = vec2(0x2b3e6c1fu, 0x9b05688cu);
+ var v13 = vec2(0x2b3e6c1fu, 0x9b05688cu);
+ var v6 = vec2(0xfb41bd6bu, 0x1f83d9abu);
+ var v14 = vec2(0xfb41bd6bu, 0x1f83d9abu);
+ var v7 = vec2(0x137e2179u, 0x5be0cd19u);
+ var v15 = vec2(0x137e2179u, 0x5be0cd19u);
+ v0 ^= vec2(0x01010008u, 0x0u);
+ v12 ^= vec2(0x28u, 0x0u);
+ v14 ^= vec2(~0x0u);
+ // COMPRESS
+ // ROUND 0
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v12 ^= v0;
+ v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m2;
+ v1.y += u32(v1.x < m2.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2(31u)).yx | (v5 <<
vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 1 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m4; + v1.y += u32(v1.x < m4.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m1; + v0.y += u32(v0.x < m1.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + v1 += m0; + v1.y += u32(v1.x < m0.x); + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + v1 += m2; + 
v1.y += u32(v1.x < m2.x); + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m3; + v3.y += u32(v3.x < m3.x); + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 2 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m0; + v1.y += u32(v1.x < m0.x); + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m2; + v2.y += u32(v2.x < m2.x); + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + v1 += m3; + v1.y += u32(v1.x < m3.x); + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y 
+= u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m1; + v2.y += u32(v2.x < m1.x); + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m4; + v3.y += u32(v3.x < m4.x); + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 3 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m3; + v1.y += u32(v1.x < m3.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m1; + v1.y += u32(v1.x < m1.x); + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m2; + v0.y += u32(v0.x < m2.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m4; + v2.y += u32(v2.x < m4.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << 
vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m0; + v2.y += u32(v2.x < m0.x); + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 4 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + v0 += m0; + v0.y += u32(v0.x < m0.x); + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m2; + v2.y += u32(v2.x < m2.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m4; + v2.y += u32(v2.x < m4.x); + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m1; + v0.y += u32(v0.x < m1.x); + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 
= (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m3; + v3.y += u32(v3.x < m3.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 5 + v0 += v4; + v0.y += u32(v0.x < v4.x); + v0 += m2; + v0.y += u32(v0.x < m2.x); + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m0; + v2.y += u32(v2.x < m0.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + v3 += m3; + v3.y += u32(v3.x < m3.x); + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m4; + v0.y += u32(v0.x < m4.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m1; + v3.y += u32(v3.x < m1.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < 
v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 6 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m1; + v1.y += u32(v1.x < m1.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + v3 += m4; + v3.y += u32(v3.x < m4.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m0; + v0.y += u32(v0.x < m0.x); + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + v1 += m3; + v1.y += u32(v1.x < m3.x); + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m2; + v2.y += u32(v2.x < m2.x); + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 
+= v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 7 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m1; + v2.y += u32(v2.x < m1.x); + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + v3 += m3; + v3.y += u32(v3.x < m3.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m0; + v0.y += u32(v0.x < m0.x); + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + v1 += m4; + v1.y += u32(v1.x < m4.x); + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m2; + v3.y += u32(v3.x < m2.x); + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 8 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = 
v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m3; + v2.y += u32(v2.x < m3.x); + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + v3 += m0; + v3.y += u32(v3.x < m0.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + v0 += m2; + v0.y += u32(v0.x < m2.x); + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m1; + v2.y += u32(v2.x < m1.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m4; + v2.y += u32(v2.x < m4.x); + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 9 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + v0 += m2; + v0.y += u32(v0.x < 
m2.x); + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + // NOP + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m4; + v1.y += u32(v1.x < m4.x); + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + v3 += m1; + v3.y += u32(v3.x < m1.x); + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + v2 += m3; + v2.y += u32(v2.x < m3.x); + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + v3 += m0; + v3.y += u32(v3.x < m0.x); + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 10 + v0 += v4; + v0.y += u32(v0.x < v4.x); + v0 += m0; + v0.y += u32(v0.x < m0.x); + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + v0 += m1; + v0.y += u32(v0.x < m1.x); + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx 
| (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m2; + v1.y += u32(v1.x < m2.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m3; + v1.y += u32(v1.x < m3.x); + v13 ^= v1; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v2 += v6; + v2.y += u32(v2.x < v6.x); + v2 += m4; + v2.y += u32(v2.x < m4.x); + v14 ^= v2; + v14 = v14.yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v2 += v6; + v2.y += u32(v2.x < v6.x); + // NOP + v14 ^= v2; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v10 += v14; + v10.y += u32(v10.x < v14.x); + v6 ^= v10; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = v15.yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v3 += v7; + v3.y += u32(v3.x < v7.x); + // NOP + v15 ^= v3; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v11 += v15; + v11.y += u32(v11.x < v15.x); + v7 ^= v11; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = v15.yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v0 += v5; + v0.y += u32(v0.x < v5.x); + // NOP + v15 ^= v0; + v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx; + v10 += v15; + v10.y += u32(v10.x < v15.x); + v5 ^= v10; + v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u)); + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = v12.yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx; + v1 += v6; + v1.y += u32(v1.x < v6.x); + // NOP + v12 ^= v1; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v11 += v12; + v11.y += u32(v11.x < v12.x); + v6 ^= v11; + v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u)); + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = v13.yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx; + v2 += v7; + v2.y += u32(v2.x < v7.x); + // NOP + v13 ^= v2; + v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx; + v8 += v13; + v8.y += u32(v8.x < v13.x); + v7 ^= v8; + v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u)); + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = v14.yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v3 += v4; + v3.y += u32(v3.x < v4.x); + // NOP + v14 ^= v3; + v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx; + v9 += v14; + v9.y += u32(v9.x < v14.x); + v4 ^= v9; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + // ROUND 11 + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = v12.yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx; + v0 += v4; + v0.y += u32(v0.x < v4.x); + // NOP + v12 ^= v0; + v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx; + v8 += v12; + v8.y += u32(v8.x < v12.x); + v4 ^= v8; + v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u)); + v1 += v5; + v1.y += u32(v1.x < v5.x); + v1 += m4; + v1.y += u32(v1.x < m4.x); + v13 ^= v1; + v13 = v13.yx; + v9 += v13; + v9.y += u32(v9.x < v13.x); + v5 ^= v9; + v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx; + v1 += v5; + 
+ // ROUND 11
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2(24u)) | (v4 << vec2(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2(16u)) | (v12 << vec2(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2(31u)).yx | (v4 << vec2(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m4;
+ v1.y += u32(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2(24u)) | (v6 << vec2(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2(16u)) | (v14 << vec2(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2(31u)).yx | (v6 << vec2(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2(24u)) | (v5 << vec2(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2(16u)) | (v15 << vec2(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2(31u)).yx | (v5 << vec2(1u));
+ // G NOP
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2(24u)) | (v7 << vec2(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2(16u)) | (v13 << vec2(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2(31u)).yx | (v7 << vec2(1u));
+ // G NOP
+ // Set nonce if it exceeds difficulty threshold and no other thread has set it.
+ let result = vec2(0xf3bcc908u, 0x6a09e667u) ^ vec2(0x01010008u, 0x0u) ^ v0 ^ v8;
+ if (result.y > input.difficulty.y || (result.y == input.difficulty.y && result.x >= input.difficulty.x)) {
+   loop {
+     let swap = atomicCompareExchangeWeak(&output.found, 0u, 1u);
+     if (swap.exchanged) {
+       output.work = m0;
+       output.difficulty = result;
+       break;
+     }
+     if (swap.old_value != 0u) {
+       break;
+     }
+   }
+   return;
+ }
+}
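The rotation shorthand baked into the generated WGSL above can be sanity-checked off-GPU. The following standalone Node.js sketch (illustrative only, not part of this patch) uses BigInt to confirm that each two-word formula matches a true 64-bit right rotation, under the convention that `x` holds the low word and `y` the high word:

```
// Illustrative check: confirm the vec2<u32> rotation formulas emitted into
// compute.wgsl match true 64-bit rotations.
const MASK32 = 0xffffffffn
const MASK64 = 0xffffffffffffffffn

// Reference 64-bit right rotation.
const rotr64 = (x, n) => ((x >> n) | (x << (64n - n))) & MASK64

// Split a u64 into [lo, hi], mirroring vec2<u32> where x = lo and y = hi.
const split = (x) => [x & MASK32, (x >> 32n) & MASK32]
const eq = (a, b) => a[0] === b[0] && a[1] === b[1]

const x = 0x0123456789abcdefn
const [lo, hi] = split(x)

console.log(eq(split(rotr64(x, 32n)), [hi, lo])) // v.yx
console.log(eq(split(rotr64(x, 24n)), [
  ((lo >> 24n) | (hi << 8n)) & MASK32,  // (v >> 24) | (v << 8).yx
  ((hi >> 24n) | (lo << 8n)) & MASK32
]))
console.log(eq(split(rotr64(x, 16n)), [
  ((lo >> 16n) | (hi << 16n)) & MASK32, // (v >> 16) | (v << 16).yx
  ((hi >> 16n) | (lo << 16n)) & MASK32
]))
console.log(eq(split(rotr64(x, 63n)), [
  ((hi >> 31n) | (lo << 1n)) & MASK32,  // (v >> 31).yx | (v << 1)
  ((lo >> 31n) | (hi << 1n)) & MASK32
])) // all four checks print true
```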
diff --git a/src/lib/generate/webgpu/shaders/generate.ts b/src/lib/generate/webgpu/shaders/generate.ts
deleted file mode 100644
index 242dc35..0000000
--- a/src/lib/generate/webgpu/shaders/generate.ts
+++ /dev/null
@@ -1,270 +0,0 @@
-//! SPDX-FileCopyrightText: 2025 Chris Duncan
-//! SPDX-License-Identifier: GPL-3.0-or-later
-
-const blake2b_state: number[][] = [
-  [0, 4, 8, 12],
-  [1, 5, 9, 13],
-  [2, 6, 10, 14],
-  [3, 7, 11, 15],
-  [0, 5, 10, 15],
-  [1, 6, 11, 12],
-  [2, 7, 8, 13],
-  [3, 4, 9, 14]
-]
-
-const blake2b_sigma: (0 | 1 | 2 | 3 | 4 | 'Z')[][] = [
-  [0, 1, 2, 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
-  ['Z', 'Z', 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z', 0, 2, 'Z', 'Z', 'Z', 3],
-  ['Z', 'Z', 'Z', 0, 'Z', 2, 'Z', 'Z', 'Z', 'Z', 3, 'Z', 'Z', 1, 'Z', 4],
-  ['Z', 'Z', 3, 1, 'Z', 'Z', 'Z', 'Z', 2, 'Z', 'Z', 'Z', 4, 0, 'Z', 'Z'],
-  ['Z', 0, 'Z', 'Z', 2, 4, 'Z', 'Z', 'Z', 1, 'Z', 'Z', 'Z', 'Z', 3, 'Z'],
-  [2, 'Z', 'Z', 'Z', 0, 'Z', 'Z', 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z'],
-  ['Z', 'Z', 1, 'Z', 'Z', 'Z', 4, 'Z', 0, 'Z', 'Z', 3, 'Z', 2, 'Z', 'Z'],
-  ['Z', 'Z', 'Z', 'Z', 'Z', 1, 3, 'Z', 'Z', 0, 'Z', 4, 'Z', 'Z', 2, 'Z'],
-  ['Z', 'Z', 'Z', 'Z', 'Z', 3, 0, 'Z', 'Z', 2, 'Z', 'Z', 1, 4, 'Z', 'Z'],
-  ['Z', 2, 'Z', 4, 'Z', 'Z', 1, 'Z', 'Z', 'Z', 'Z', 'Z', 3, 'Z', 'Z', 0],
-  [0, 1, 2, 3, 4, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
-  ['Z', 'Z', 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z', 0, 2, 'Z', 'Z', 'Z', 3]
-]
-
-/**
-* Initialization vector defined by BLAKE2.
-* Application of each XOR is defined by BLAKE2 section 2.4 compression function.
-* Each vec2 represents two halves of the original u64 value from the reference
-* implementation. They appear reversed pairwise as defined below, but this is
-* an illusion due to endianness: the \`x\` component of the vector is the low
-* bits and the \`y\` component is the high bits, and if you laid the bits out
-* individually, they would match the little-endian 64-bit representation.
-*/
-const blake2b_iv = [
-  'vec2(0xF3BCC908u, 0x6A09E667u)',
-  'vec2(0x84CAA73Bu, 0xBB67AE85u)',
-  'vec2(0xFE94F82Bu, 0x3C6EF372u)',
-  'vec2(0x5F1D36F1u, 0xA54FF53Au)',
-  'vec2(0xADE682D1u, 0x510E527Fu)',
-  'vec2(0x2B3E6C1Fu, 0x9B05688Cu)',
-  'vec2(0xFB41BD6Bu, 0x1F83D9ABu)',
-  'vec2(0x137E2179u, 0x5BE0CD19u)'
-]
-
-/**
-* Parameter block as defined in BLAKE2 section 2.8 and configured as follows:
-* maximal depth = 1, fanout = 1, digest byte length = 8
-*/
-const blake2b_param = `vec2(0x01010008u, 0u)`
-
-function G (a: number, b: number, c: number, d: number,
-  x: 0 | 1 | 2 | 3 | 4 | 'Z', y: 0 | 1 | 2 | 3 | 4 | 'Z'
-): string {
-  return `
-v${a} += v${b};
-v${a}.y += u32(v${a}.x < v${b}.x);
-
-${x === 'Z' ? `// NOP` : `
-v${a} += m${x};
-v${a}.y += u32(v${a}.x < m${x}.x);`}
-
-v${d} = (v${d} ^ v${a}).yx;
-
-v${c} += v${d};
-v${c}.y += u32(v${c}.x < v${d}.x);
-
-v${b} ^= v${c};
-v${b} = (v${b} >> vec2(24u)) | (v${b} << vec2(8u)).yx;
-
-v${a} += v${b};
-v${a}.y += u32(v${a}.x < v${b}.x);
-
-${y === 'Z' ? `// NOP` : `
-v${a} += m${y};
-v${a}.y += u32(v${a}.x < m${y}.x);`}
-
-v${d} ^= v${a};
-v${d} = (v${d} >> vec2(16u)) | (v${d} << vec2(16u)).yx;
-
-v${c} += v${d};
-v${c}.y += u32(v${c}.x < v${d}.x);
-
-v${b} ^= v${c};
-v${b} = (v${b} >> vec2(31u)).yx | (v${b} << vec2(1u));
-`}
-
-function ROUND (r: number): string {
-  let output = `// ROUND ${r}`
-  for (let i = 0; i < 8; i++) {
-    const [a, b, c, d] = blake2b_state[i]
-    const s = blake2b_sigma[r]
-    output += r === 11 && (i === 5 || i === 7) ? `// G NOP` : `
-      ${G(a, b, c, d, s[2 * i], s[2 * i + 1])}
-    `
-  }
-  return output
-}
-
-function SETUP (): string {
-  let output = `
-// Initialize unique nonce
-let m0: vec2<u32> = seed ^ global_id.xy;
-
-// Initialize state vector configured for Nano
-// v0: depth=1; fanout=1; outlen=8
-// v12: input byte length
-// v14: final block flag
-`
-  for (let i = 0; i < 8; i++) {
-    output += `
-var v${i}: vec2<u32> = ${blake2b_iv[i]};
-var v${i + 8}: vec2<u32> = ${blake2b_iv[i]};
-`
-  }
-  output += `
-v0 ^= vec2(0x01010008u, 0u);
-v12 ^= vec2(40u, 0u);
-v14 = ~v14;
-`
-  return output
-}
-
-function HASH () {
-  let output = `
-/**
-* Twelve rounds of G mixing as part of BLAKE2b compression step, each divided
-* into eight subprocesses, each of which is further paired to be processed in
-* parallel by packing independent vec2 variables into vec4 variables.
-* Each subprocess statement execution is alternated so that the compiler can
-* interleave independent instructions for improved scheduling. That is to say,
-* the first statement \`a = a + b\` is executed for each subprocess, and then
-* the next statement \`a = a + m[sigma[r][2*i+0]]\` is executed, and so on
-* through all the steps of the G mix function. Once subprocesses 1-4 are done,
-* computation on subprocesses 5-8 is executed in the same manner.
-*
-* Each subprocess applies transformations to \`m\` and \`v\` variables based on a
-* defined set of index inputs. The algorithm for each subprocess is defined as
-* follows:
-*
-* r is the current round
-* i is the current subprocess within that round
-* a, b, c, d are elements of \`v\` at specific indexes
-* sigma is a defined set of array indexes for \`m\`
-* rotr64 is a right-hand bit rotation function
-*
-* a = a + b
-* a = a + m[sigma[r][2*i+0]]
-* d = rotr64(d ^ a, 32)
-* c = c + d
-* b = rotr64(b ^ c, 24)
-* a = a + b
-* a = a + m[sigma[r][2*i+1]]
-* d = rotr64(d ^ a, 16)
-* c = c + d
-* b = rotr64(b ^ c, 63)
-*
-* Each sum step has an extra carry addition. Note that the m[sigma] sum is
-* skipped if m[sigma] is zero since it effectively does nothing. Also note
-* that rotations must be applied differently from the reference implementation
-* due to the lack of both a native rotate function and 64-bit support in WGSL.
-*/
-`
-  for (let i = 0; i < 12; i++) output += ROUND(i)
-  return output
-}
-
-function UNIFORMS () {
-  return `
-// Input buffers
-struct INPUT {
-  hash: array<vec4<u32>, 2>,
-  difficulty: vec2<u32>,
-  seed: vec2<u32>
-};
-@group(0) @binding(0) var<uniform> input: INPUT;
-
-// Output buffers
-struct OUTPUT {
-  found: atomic<u32>,
-  work: vec2<u32>,
-  difficulty: vec2<u32>
-};
-@group(0) @binding(1) var<storage, read_write> output: OUTPUT;
-`}
-
-function SHARED () {
-  return `
-// Used to fill partial \`m\` vec2 constructions.
-const mZ = vec2(0u);
-
-// Shared flag to prevent execution for all workgroup threads based on the
-// atomicLoad() result of a single member thread.
-var<workgroup> found: bool;
-
-// Shared memory for difficulty, hash, and seed.
-var<workgroup> m1: vec2<u32>;
-var<workgroup> m2: vec2<u32>;
-var<workgroup> m3: vec2<u32>;
-var<workgroup> m4: vec2<u32>;
-var<workgroup> d: vec2<u32>;
-var<workgroup> seed: vec2<u32>;
-`}
-
-/** Output to compute.wgsl using Node */
-console.log(`//! SPDX-FileCopyrightText: 2025 Chris Duncan
-//! SPDX-License-Identifier: GPL-3.0-or-later
-
-${UNIFORMS()}
-${SHARED()}
-
-/**
-* Main compute function
-*
-* Computes with a workgroup size of 64, which balances warps between NVIDIA
-* and AMD cards while still considering the power-sensitive requirements of
-* mobile devices. The entire workgroup exits immediately if a nonce was
-* already found by a previous workgroup.
-*
-* Each component of a random 8-byte value, provided by the UBO as a vec2,
-* is XOR'd with a different dimensional index from the global thread identifier
-* to create a unique nonce value for each thread.
-*
-* Where the reference implementation uses array lookups, the NanoPow
-* implementation assigns each array element to its own variable to enhance
-* performance, but the variable name still contains the original index digit.
-*/
-@compute @workgroup_size(64)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
-  if (local_id.x == 0u) {
-    found = atomicLoad(&output.found) != 0u;
-    seed = input.seed;
-    m1 = input.hash[0u].xy;
-    m2 = input.hash[0u].zw;
-    m3 = input.hash[1u].xy;
-    m4 = input.hash[1u].zw;
-    d = input.difficulty;
-  }
-  workgroupBarrier();
-  if (found) { return; }
-
-  ${SETUP()}
-  ${HASH()}
-
-  // NONCE CHECK
-  // Set nonce if it exceeds difficulty threshold and no other thread has set it.
-  let result = ${blake2b_iv[0]} ^ ${blake2b_param} ^ v0 ^ v8;
-  if (result.y > input.difficulty.y || (result.y == input.difficulty.y && result.x >= input.difficulty.x)) {
-    loop {
-      let swap = atomicCompareExchangeWeak(&output.found, 0u, 1u);
-      if (swap.exchanged) {
-        output.work = m0;
-        output.difficulty = result;
-        break;
-      }
-      if (swap.old_value != 0u) {
-        break;
-      }
-    }
-    return;
-  }
-}
-`)
-
-export { }
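For reference, the reduced sigma table in the deleted generator follows mechanically from the BLAKE2b specification: a Nano work candidate hashes only 40 bytes, so message words m5 through m15 are always zero and any schedule index above 4 can degrade to the 'Z' no-op marker. A minimal sketch of that derivation (illustrative only, not part of this patch; just the first two reference rows are shown):

```
// First two rows of the message schedule from the BLAKE2 specification.
const BLAKE2B_SIGMA = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
  [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3]
]
// Words m5-m15 are always zero for a 40-byte input, so mixing them is a no-op.
const reduced = BLAKE2B_SIGMA.map(row => row.map(i => i > 4 ? 'Z' : i))
console.log(reduced[1])
// [ 'Z', 'Z', 4, 'Z', 'Z', 'Z', 'Z', 'Z', 1, 'Z', 0, 2, 'Z', 'Z', 'Z', 3 ]
```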
diff --git a/src/lib/generate/webgpu/shaders/tsconfig.json b/src/lib/generate/webgpu/shaders/tsconfig.json
deleted file mode 100644
index 482924f..0000000
--- a/src/lib/generate/webgpu/shaders/tsconfig.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "include": [
-    "./*.ts"
-  ],
-  "compilerOptions": {
-    "target": "ESNext",
-    "module": "ESNext",
-    "moduleResolution": "Bundler",
-    "outDir": "./build"
-  }
-}
diff --git a/src/lib/generate/webgpu/shaders/tsconfig.json.license b/src/lib/generate/webgpu/shaders/tsconfig.json.license
deleted file mode 100644
index 7f56691..0000000
--- a/src/lib/generate/webgpu/shaders/tsconfig.json.license
+++ /dev/null
@@ -1,2 +0,0 @@
-SPDX-FileCopyrightText: 2025 Chris Duncan
-SPDX-License-Identifier: GPL-3.0-or-later
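The nonce check in both the deleted generator and the generated shader composes an unsigned 64-bit comparison from two u32 comparisons: compare the high words first, then fall back to the low words on a tie. A quick Node.js check of that identity (illustrative helper names, not part of this patch):

```
// [lo, hi] pairs mirror vec2<u32> where x = lo and y = hi.
const ge64 = ([xLo, xHi], [yLo, yHi]) => xHi > yHi || (xHi === yHi && xLo >= yLo)
const toU64 = ([lo, hi]) => (BigInt(hi) << 32n) | BigInt(lo)

const result = [0x89abcdef, 0x01234567]
const difficulty = [0xffffffff, 0x01234566]
console.log(ge64(result, difficulty) === (toU64(result) >= toU64(difficulty))) // true
```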

diff --git a/test/index.html b/test/index.html
index 80aa2e4..a22288b 100644
--- a/test/index.html
+++ b/test/index.html
@@ -298,8 +298,8 @@ SPDX-License-Identifier: GPL-3.0-or-later
 nano-pow
 
 https://www.npmjs.com/package/nano-pow
 
 Speed test for NanoPow proof-of-work tool.
 
-NanoPow uses cutting edge WebGPU technology. Not all browsers are supported.
-NanoPow uses WebGL 2.0 as a fallback option if WebGPU is not detected.
+NanoPow uses cutting-edge WebGPU technology, and not all browsers support it.
+NanoPow uses WebGL 2.0 as a fallback option if WebGPU is not detected and a WASM module if neither GPU API is available.
 
 Times below are in milliseconds and are summarized by various averaging methods.
 
 Level of Effort depends on hardware and does not guarantee faster results.
-- 2.47.3