--- /dev/null
+export function main (seed: u64, h0: u64, h1: u64, h2: u64, h3: u64, difficulty: u64): u64 {
+ let m0 = v128.splat<u64>(0);
+ const m1 = v128.splat<u64>(h0);
+ const m2 = v128.splat<u64>(h1);
+ const m3 = v128.splat<u64>(h2);
+ const m4 = v128.splat<u64>(h3);
+ let r0: u64 = 0
+ let r1: u64 = 0
+ let result = v128.splat<u64>(0);
+ const blake2b_iv_param = v128.xor(v128.splat<u64>((0x6a09e667 << 32) | 0xf3bcc908), v128.splat<u64>((0x0 << 32) | 0x01010008));
+ const iterations: u64 = 1 << 24;
+ for (let i: u64 = 0; i < iterations; i++) {
+ m0 = i64x2(unchecked(seed + i), unchecked(seed + i + 1));
+ i += 2
+ // INITIALIZE STATE VECTOR
+ // v0: depth=1; fanout=1; outlen=8
+ // v12: input byte length
+ // v14: final block flag
+ let v0 = v128.splat<u64>((0x6a09e667 << 32) | 0xf3bcc908);
+ let v8 = v128.splat<u64>((0x6a09e667 << 32) | 0xf3bcc908);
+ let v1 = v128.splat<u64>((0xbb67ae85 << 32) | 0x84caa73b);
+ let v9 = v128.splat<u64>((0xbb67ae85 << 32) | 0x84caa73b);
+ let v2 = v128.splat<u64>((0x3c6ef372 << 32) | 0xfe94f82b);
+ let v10 = v128.splat<u64>((0x3c6ef372 << 32) | 0xfe94f82b);
+ let v3 = v128.splat<u64>((0xa54ff53a << 32) | 0x5f1d36f1);
+ let v11 = v128.splat<u64>((0xa54ff53a << 32) | 0x5f1d36f1);
+ let v4 = v128.splat<u64>((0x510e527f << 32) | 0xade682d1);
+ let v12 = v128.splat<u64>((0x510e527f << 32) | 0xade682d1);
+ let v5 = v128.splat<u64>((0x9b05688c << 32) | 0x2b3e6c1f);
+ let v13 = v128.splat<u64>((0x9b05688c << 32) | 0x2b3e6c1f);
+ let v6 = v128.splat<u64>((0x1f83d9ab << 32) | 0xfb41bd6b);
+ let v14 = v128.splat<u64>((0x1f83d9ab << 32) | 0xfb41bd6b);
+ let v7 = v128.splat<u64>((0x5be0cd19 << 32) | 0x137e2179);
+ let v15 = v128.splat<u64>((0x5be0cd19 << 32) | 0x137e2179);
+ v0 = v128.xor(v0, v128.splat<u64>((0x0 << 32) | 0x01010008));
+ v12 = v128.xor(v12, v128.splat<u64>((0x0 << 32) | 0x28));
+ v14 = v128.xor(v14, v128.splat<u64>(~0x0));
+ // COMPRESS
+ // ROUND 0
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m0));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m1));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m2));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m3));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m4));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 1
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m4));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m1));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ v1 = unchecked(v128.add<u64>(v1, m0));
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ v1 = unchecked(v128.add<u64>(v1, m2));
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m3));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 2
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m0));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m2));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ v1 = unchecked(v128.add<u64>(v1, m3));
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m1));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m4));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 3
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m3));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m1));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m2));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m4));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m0));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 4
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m0));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m2));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m4));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m1));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m3));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 5
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m2));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m0));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ v3 = unchecked(v128.add<u64>(v3, m3));
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m4));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m1));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 6
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m1));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ v3 = unchecked(v128.add<u64>(v3, m4));
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m0));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ v1 = unchecked(v128.add<u64>(v1, m3));
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m2));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 7
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m1));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ v3 = unchecked(v128.add<u64>(v3, m3));
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m0));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ v1 = unchecked(v128.add<u64>(v1, m4));
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m2));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 8
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m3));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ v3 = unchecked(v128.add<u64>(v3, m0));
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m2));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m1));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m4));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 9
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m2));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m4));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ v3 = unchecked(v128.add<u64>(v3, m1));
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ v2 = unchecked(v128.add<u64>(v2, m3));
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ v3 = unchecked(v128.add<u64>(v3, m0));
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 10
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m0));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ v0 = unchecked(v128.add<u64>(v0, m1));
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m2));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m3));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ v2 = unchecked(v128.add<u64>(v2, m4));
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v6));
+ // NOP
+ v12 = v128.xor(v12, v1);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v12));
+ v6 = v128.xor(v6, v11);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v4));
+ // NOP
+ v14 = v128.xor(v14, v3);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v14));
+ v4 = v128.xor(v4, v9);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ // ROUND 11
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.shuffle<u8>(v4, v4, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v4));
+ // NOP
+ v12 = v128.xor(v12, v0);
+ v12 = v128.shuffle<u8>(v12, v12, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v12));
+ v4 = v128.xor(v4, v8);
+ v4 = v128.or(v128.shr<u64>(v4, 63), v128.shl<u64>(v4, 1));
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ v1 = unchecked(v128.add<u64>(v1, m4));
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v1 = unchecked(v128.add<u64>(v1, v5));
+ // NOP
+ v13 = v128.xor(v13, v1);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v9 = unchecked(v128.add<u64>(v9, v13));
+ v5 = v128.xor(v5, v9);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.shuffle<u8>(v6, v6, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v6));
+ // NOP
+ v14 = v128.xor(v14, v2);
+ v14 = v128.shuffle<u8>(v14, v14, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v14));
+ v6 = v128.xor(v6, v10);
+ v6 = v128.or(v128.shr<u64>(v6, 63), v128.shl<u64>(v6, 1));
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v3 = unchecked(v128.add<u64>(v3, v7));
+ // NOP
+ v15 = v128.xor(v15, v3);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v11 = unchecked(v128.add<u64>(v11, v15));
+ v7 = v128.xor(v7, v11);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ v0 = unchecked(v128.add<u64>(v0, m1));
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.shuffle<u8>(v5, v5, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v0 = unchecked(v128.add<u64>(v0, v5));
+ // NOP
+ v15 = v128.xor(v15, v0);
+ v15 = v128.shuffle<u8>(v15, v15, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v10 = unchecked(v128.add<u64>(v10, v15));
+ v5 = v128.xor(v5, v10);
+ v5 = v128.or(v128.shr<u64>(v5, 63), v128.shl<u64>(v5, 1));
+ // G NOP
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.shuffle<u8>(v7, v7, 3,4,5,6,7,0,1,2, 11,12,13,14,15,8,9,10);
+ v2 = unchecked(v128.add<u64>(v2, v7));
+ // NOP
+ v13 = v128.xor(v13, v2);
+ v13 = v128.shuffle<u8>(v13, v13, 2,3,4,5,6,7,0,1, 10,11,12,13,14,15,8,9);
+ v8 = unchecked(v128.add<u64>(v8, v13));
+ v7 = v128.xor(v7, v8);
+ v7 = v128.or(v128.shr<u64>(v7, 63), v128.shl<u64>(v7, 1));
+ // G NOP
+ // RESULT
+ result = v128.xor(blake2b_iv_param, v128.xor(v0, v8));
+ r0 = v128.extract_lane<u64>(result, 0);
+ r1 = v128.extract_lane<u64>(result, 1);
+ if (r0 >= difficulty || r1 >= difficulty) break
+ }
+ // Throw trap if no valid nonce found, else get result from SIMD lane.
+ if (r0 < difficulty && r1 < difficulty) {
+ return 1 / 0
+ }
+ return select(v128.extract_lane<u64>(m0, 0), v128.extract_lane<u64>(m0, 1), r0 >= difficulty);
+}
--- /dev/null
+#version 300 es
+#pragma vscode_glsllint_stage: frag
+//! SPDX-FileCopyrightText: 2025 Chris Duncan <chris@zoso.dev>
+//! SPDX-FileContributor: Ben Green <ben@latenightsketches.com>
+//! SPDX-License-Identifier: GPL-3.0-or-later AND MIT
+// hash - Array of 32-bit integers comprising a 32-byte Nano block hash
+// difficulty - Minimum threshold for BLAKE2b result for work to be valid
+// seed - Random value which is uniquely varied by pixel coordinates
+layout(std140) uniform INPUT {
+ uint hash[8]; // 32-byte Nano block hash as eight 32-bit words; NOTE(review): std140 pads each array element to a 16-byte stride — confirm the host-side uploader writes with matching padding
+ uvec2 difficulty; // 64-bit minimum BLAKE2b threshold split into two 32-bit halves — TODO confirm (low, high) component order against the host code
+ uvec2 seed; // random 64-bit nonce base; XOR'd with gl_FragCoord per pixel to derive a unique nonce (see main)
+};
+// work - Pixel value output if and only if valid nonce is found
+out uvec4 work; // fragment output; initialized to uvec4(0u) in main and presumably overwritten only when a valid nonce is found — confirm in the unseen tail of main
+// Main draw function
+//
+// Draws a single pixel per shader invocation, multiplied by the dimensions of
+// the canvas.
+//
+// Each component of a random 8-byte value, provided by the INPUT as a uvec2,
+// is XOR'd with the 2-D coordinates of the pixel on the canvas to create a
+// unique nonce value for each.
+//
+// Where the reference implementation uses array lookups, the NanoPow
+// implementation assigns each array element to its own variable to enhance
+// performance, but the variable name still contains the original index digit.
+void main() {
+ // Initialize fragment output
+ work = uvec4(0u);
+ // Initialize unique nonce and block hash
+ uvec2 m0 = seed ^ uvec2(gl_FragCoord);
+ uvec2 m1 = uvec2(hash[0u], hash[1u]);
+ uvec2 m2 = uvec2(hash[2u], hash[3u]);
+ uvec2 m3 = uvec2(hash[4u], hash[5u]);
+ uvec2 m4 = uvec2(hash[6u], hash[7u]);
+ // INITIALIZE STATE VECTOR
+ // v0: depth=1; fanout=1; outlen=8
+ // v12: input byte length
+ // v14: final block flag
+ uvec2 v0 = uvec2(0xf3bcc908u, 0x6a09e667u);
+ uvec2 v8 = uvec2(0xf3bcc908u, 0x6a09e667u);
+ uvec2 v1 = uvec2(0x84caa73bu, 0xbb67ae85u);
+ uvec2 v9 = uvec2(0x84caa73bu, 0xbb67ae85u);
+ uvec2 v2 = uvec2(0xfe94f82bu, 0x3c6ef372u);
+ uvec2 v10 = uvec2(0xfe94f82bu, 0x3c6ef372u);
+ uvec2 v3 = uvec2(0x5f1d36f1u, 0xa54ff53au);
+ uvec2 v11 = uvec2(0x5f1d36f1u, 0xa54ff53au);
+ uvec2 v4 = uvec2(0xade682d1u, 0x510e527fu);
+ uvec2 v12 = uvec2(0xade682d1u, 0x510e527fu);
+ uvec2 v5 = uvec2(0x2b3e6c1fu, 0x9b05688cu);
+ uvec2 v13 = uvec2(0x2b3e6c1fu, 0x9b05688cu);
+ uvec2 v6 = uvec2(0xfb41bd6bu, 0x1f83d9abu);
+ uvec2 v14 = uvec2(0xfb41bd6bu, 0x1f83d9abu);
+ uvec2 v7 = uvec2(0x137e2179u, 0x5be0cd19u);
+ uvec2 v15 = uvec2(0x137e2179u, 0x5be0cd19u);
+ v0 ^= uvec2(0x01010008u, 0x0u);
+ v12 ^= uvec2(0x28u, 0x0u);
+ v14 ^= uvec2(~0x0u);
+ // COMPRESS
+ // ROUND 0
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m0;
+ v0.y += uint(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m1;
+ v0.y += uint(v0.x < m1.x);
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m2;
+ v1.y += uint(v1.x < m2.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m3;
+ v1.y += uint(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m4;
+ v2.y += uint(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 1
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m4;
+ v1.y += uint(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m1;
+ v0.y += uint(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ v1 += m0;
+ v1.y += uint(v1.x < m0.x);
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ v1 += m2;
+ v1.y += uint(v1.x < m2.x);
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m3;
+ v3.y += uint(v3.x < m3.x);
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 2
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m0;
+ v1.y += uint(v1.x < m0.x);
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m2;
+ v2.y += uint(v2.x < m2.x);
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ v1 += m3;
+ v1.y += uint(v1.x < m3.x);
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m1;
+ v2.y += uint(v2.x < m1.x);
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m4;
+ v3.y += uint(v3.x < m4.x);
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 3
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m3;
+ v1.y += uint(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m1;
+ v1.y += uint(v1.x < m1.x);
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m2;
+ v0.y += uint(v0.x < m2.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m4;
+ v2.y += uint(v2.x < m4.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m0;
+ v2.y += uint(v2.x < m0.x);
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 4
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m0;
+ v0.y += uint(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m2;
+ v2.y += uint(v2.x < m2.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m4;
+ v2.y += uint(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m1;
+ v0.y += uint(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m3;
+ v3.y += uint(v3.x < m3.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 5
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m2;
+ v0.y += uint(v0.x < m2.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m0;
+ v2.y += uint(v2.x < m0.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ v3 += m3;
+ v3.y += uint(v3.x < m3.x);
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m4;
+ v0.y += uint(v0.x < m4.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m1;
+ v3.y += uint(v3.x < m1.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 6
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m1;
+ v1.y += uint(v1.x < m1.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ v3 += m4;
+ v3.y += uint(v3.x < m4.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m0;
+ v0.y += uint(v0.x < m0.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ v1 += m3;
+ v1.y += uint(v1.x < m3.x);
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m2;
+ v2.y += uint(v2.x < m2.x);
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 7
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m1;
+ v2.y += uint(v2.x < m1.x);
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ v3 += m3;
+ v3.y += uint(v3.x < m3.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m0;
+ v0.y += uint(v0.x < m0.x);
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ v1 += m4;
+ v1.y += uint(v1.x < m4.x);
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m2;
+ v3.y += uint(v3.x < m2.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 8
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m3;
+ v2.y += uint(v2.x < m3.x);
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ v3 += m0;
+ v3.y += uint(v3.x < m0.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m2;
+ v0.y += uint(v0.x < m2.x);
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m1;
+ v2.y += uint(v2.x < m1.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m4;
+ v2.y += uint(v2.x < m4.x);
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 9
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m2;
+ v0.y += uint(v0.x < m2.x);
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m4;
+ v1.y += uint(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ v3 += m1;
+ v3.y += uint(v3.x < m1.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ v2 += m3;
+ v2.y += uint(v2.x < m3.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ v3 += m0;
+ v3.y += uint(v3.x < m0.x);
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 10
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m0;
+ v0.y += uint(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ v0 += m1;
+ v0.y += uint(v0.x < m1.x);
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m2;
+ v1.y += uint(v1.x < m2.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m3;
+ v1.y += uint(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ v2 += m4;
+ v2.y += uint(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v1 += v6;
+ v1.y += uint(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v11 += v12;
+ v11.y += uint(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v3 += v4;
+ v3.y += uint(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v9 += v14;
+ v9.y += uint(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ // ROUND 11
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(24u)) | (v4 << uvec2(8u)).yx;
+ v0 += v4;
+ v0.y += uint(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> uvec2(16u)) | (v12 << uvec2(16u)).yx;
+ v8 += v12;
+ v8.y += uint(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> uvec2(31u)).yx | (v4 << uvec2(1u));
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ v1 += m4;
+ v1.y += uint(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v1 += v5;
+ v1.y += uint(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v9 += v13;
+ v9.y += uint(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(24u)) | (v6 << uvec2(8u)).yx;
+ v2 += v6;
+ v2.y += uint(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> uvec2(16u)) | (v14 << uvec2(16u)).yx;
+ v10 += v14;
+ v10.y += uint(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> uvec2(31u)).yx | (v6 << uvec2(1u));
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v3 += v7;
+ v3.y += uint(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v11 += v15;
+ v11.y += uint(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ v0 += m1;
+ v0.y += uint(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(24u)) | (v5 << uvec2(8u)).yx;
+ v0 += v5;
+ v0.y += uint(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> uvec2(16u)) | (v15 << uvec2(16u)).yx;
+ v10 += v15;
+ v10.y += uint(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> uvec2(31u)).yx | (v5 << uvec2(1u));
+ // G NOP
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(24u)) | (v7 << uvec2(8u)).yx;
+ v2 += v7;
+ v2.y += uint(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> uvec2(16u)) | (v13 << uvec2(16u)).yx;
+ v8 += v13;
+ v8.y += uint(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> uvec2(31u)).yx | (v7 << uvec2(1u));
+ // G NOP
+ // Set pixel value if it exceeds difficulty threshold, else discard it.
+ uvec2 result = uvec2(0xf3bcc908u, 0x6a09e667u) ^ uvec2(0x01010008u, 0x0u) ^ v0 ^ v8;
+ if (result.y > difficulty.y || (result.y == difficulty.y && result.x >= difficulty.x)) {
+ work = uvec4(m0, result);
+ }
+ if (work.x == 0u) {
+ discard;
+ }
+}
--- /dev/null
+// Input buffers
+//
+// Uniform parameters supplied by the host for one dispatch.
+struct INPUT {
+ // 32-byte block hash packed as two vec4<u32>; main() splits it into the
+ // four 64-bit message words m1..m4 (hash[0].xy, hash[0].zw, hash[1].xy,
+ // hash[1].zw).
+ hash: array<vec4<u32>, 2>,
+ // 64-bit difficulty threshold as a vec2<u32>.
+ difficulty: vec2<u32>,
+ // Random 8-byte base value; each thread XORs in its global invocation id
+ // to derive a unique candidate nonce (message word m0).
+ seed: vec2<u32>
+};
+@group(0) @binding(0) var<uniform>input: INPUT;
+// Output buffers
+//
+// Result storage shared by all workgroups (read_write).
+struct OUTPUT {
+ // Non-zero once a valid nonce has been published; a single thread per
+ // workgroup atomicLoad()s this so the whole workgroup can exit early.
+ found: atomic<u32>,
+ // The winning 8-byte nonce as a vec2<u32>.
+ work: vec2<u32>,
+ // NOTE(review): not read or written anywhere in this chunk — presumably
+ // echoes the difficulty the nonce satisfied; confirm against the writer.
+ difficulty: vec2<u32>
+};
+@group(0) @binding(1) var<storage, read_write > output: OUTPUT;
+// Shared flag to prevent execution for all workgroup threads based on the
+// atomicLoad() result of a single member thread.
+var<workgroup> found: bool;
+// Shared memory for hash, difficulty, and seed.
+// All of these are loaded exactly once per workgroup by the thread with
+// local_id.x == 0, then published to the other threads via the
+// workgroupBarrier() at the top of main().
+// m1..m4: the four 64-bit hash words, each held as a vec2<u32> pair.
+var<workgroup> m1: vec2<u32>;
+var<workgroup> m2: vec2<u32>;
+var<workgroup> m3: vec2<u32>;
+var<workgroup> m4: vec2<u32>;
+// d: copy of input.difficulty (64-bit threshold).
+var<workgroup> d: vec2<u32>;
+// seed: copy of input.seed (8-byte base nonce).
+var<workgroup> seed: vec2<u32>;
+// Main compute function
+//
+// Computes with a workgroup size of 64 which balances warps between NVIDIA and
+// AMD cards while still considering the power-sensitive requirements of mobile
+// devices. The entire workgroup exits immediately if a nonce was already found
+// by a previous workgroup.
+//
+// Each component of a random 8-byte value, provided by the UBO as a vec2<u32>,
+// is XOR'd with a different dimensional index from the global thread identifier
+// to create a unique nonce value for each thread.
+//
+// Where the reference implementation uses array lookups, the NanoPow
+// implementation assigns each array element to its own variable to enhance
+// performance, but the variable name still contains the original index digit.
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
+ if (local_id.x == 0u) {
+ found = atomicLoad(& output.found) != 0u;
+ seed = input.seed;
+ m1 = input.hash[0u].xy;
+ m2 = input.hash[0u].zw;
+ m3 = input.hash[1u].xy;
+ m4 = input.hash[1u].zw;
+ d = input.difficulty;
+ }
+ workgroupBarrier();
+ if (found) { return; }
+ // Initialize unique nonce
+ let m0: vec2<u32> = seed ^ global_id.xy;
+ // INITIALIZE STATE VECTOR
+ // v0: depth=1; fanout=1; outlen=8
+ // v12: input byte length
+ // v14: final block flag
+ var v0 = vec2<u32>(0xf3bcc908u, 0x6a09e667u);
+ var v8 = vec2<u32>(0xf3bcc908u, 0x6a09e667u);
+ var v1 = vec2<u32>(0x84caa73bu, 0xbb67ae85u);
+ var v9 = vec2<u32>(0x84caa73bu, 0xbb67ae85u);
+ var v2 = vec2<u32>(0xfe94f82bu, 0x3c6ef372u);
+ var v10 = vec2<u32>(0xfe94f82bu, 0x3c6ef372u);
+ var v3 = vec2<u32>(0x5f1d36f1u, 0xa54ff53au);
+ var v11 = vec2<u32>(0x5f1d36f1u, 0xa54ff53au);
+ var v4 = vec2<u32>(0xade682d1u, 0x510e527fu);
+ var v12 = vec2<u32>(0xade682d1u, 0x510e527fu);
+ var v5 = vec2<u32>(0x2b3e6c1fu, 0x9b05688cu);
+ var v13 = vec2<u32>(0x2b3e6c1fu, 0x9b05688cu);
+ var v6 = vec2<u32>(0xfb41bd6bu, 0x1f83d9abu);
+ var v14 = vec2<u32>(0xfb41bd6bu, 0x1f83d9abu);
+ var v7 = vec2<u32>(0x137e2179u, 0x5be0cd19u);
+ var v15 = vec2<u32>(0x137e2179u, 0x5be0cd19u);
+ v0 ^= vec2<u32>(0x01010008u, 0x0u);
+ v12 ^= vec2<u32>(0x28u, 0x0u);
+ v14 ^= vec2<u32>(~0x0u);
+ // COMPRESS
+ // ROUND 0
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m2;
+ v1.y += u32(v1.x < m2.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 1
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m4;
+ v1.y += u32(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ v1 += m0;
+ v1.y += u32(v1.x < m0.x);
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ v1 += m2;
+ v1.y += u32(v1.x < m2.x);
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m3;
+ v3.y += u32(v3.x < m3.x);
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 2
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m0;
+ v1.y += u32(v1.x < m0.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m2;
+ v2.y += u32(v2.x < m2.x);
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m1;
+ v2.y += u32(v2.x < m1.x);
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m4;
+ v3.y += u32(v3.x < m4.x);
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 3
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m1;
+ v1.y += u32(v1.x < m1.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m2;
+ v0.y += u32(v0.x < m2.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m0;
+ v2.y += u32(v2.x < m0.x);
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 4
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m2;
+ v2.y += u32(v2.x < m2.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m3;
+ v3.y += u32(v3.x < m3.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 5
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m2;
+ v0.y += u32(v0.x < m2.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m0;
+ v2.y += u32(v2.x < m0.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ v3 += m3;
+ v3.y += u32(v3.x < m3.x);
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m4;
+ v0.y += u32(v0.x < m4.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m1;
+ v3.y += u32(v3.x < m1.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 6
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m1;
+ v1.y += u32(v1.x < m1.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ v3 += m4;
+ v3.y += u32(v3.x < m4.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m2;
+ v2.y += u32(v2.x < m2.x);
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 7
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m1;
+ v2.y += u32(v2.x < m1.x);
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ v3 += m3;
+ v3.y += u32(v3.x < m3.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ v1 += m4;
+ v1.y += u32(v1.x < m4.x);
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m2;
+ v3.y += u32(v3.x < m2.x);
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 8
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m3;
+ v2.y += u32(v2.x < m3.x);
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ v3 += m0;
+ v3.y += u32(v3.x < m0.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m2;
+ v0.y += u32(v0.x < m2.x);
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m1;
+ v2.y += u32(v2.x < m1.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 9
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m2;
+ v0.y += u32(v0.x < m2.x);
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m4;
+ v1.y += u32(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ v3 += m1;
+ v3.y += u32(v3.x < m1.x);
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ v2 += m3;
+ v2.y += u32(v2.x < m3.x);
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ v3 += m0;
+ v3.y += u32(v3.x < m0.x);
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 10
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m0;
+ v0.y += u32(v0.x < m0.x);
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m2;
+ v1.y += u32(v1.x < m2.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m3;
+ v1.y += u32(v1.x < m3.x);
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ v2 += m4;
+ v2.y += u32(v2.x < m4.x);
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = v12.yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v1 += v6;
+ v1.y += u32(v1.x < v6.x);
+ // NOP
+ v12 ^= v1;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v11 += v12;
+ v11.y += u32(v11.x < v12.x);
+ v6 ^= v11;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = v14.yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v3 += v4;
+ v3.y += u32(v3.x < v4.x);
+ // NOP
+ v14 ^= v3;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v9 += v14;
+ v9.y += u32(v9.x < v14.x);
+ v4 ^= v9;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ // ROUND 11
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = v12.yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(24u)) | (v4 << vec2<u32>(8u)).yx;
+ v0 += v4;
+ v0.y += u32(v0.x < v4.x);
+ // NOP
+ v12 ^= v0;
+ v12 = (v12 >> vec2<u32>(16u)) | (v12 << vec2<u32>(16u)).yx;
+ v8 += v12;
+ v8.y += u32(v8.x < v12.x);
+ v4 ^= v8;
+ v4 = (v4 >> vec2<u32>(31u)).yx | (v4 << vec2<u32>(1u));
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ v1 += m4;
+ v1.y += u32(v1.x < m4.x);
+ v13 ^= v1;
+ v13 = v13.yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v1 += v5;
+ v1.y += u32(v1.x < v5.x);
+ // NOP
+ v13 ^= v1;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v9 += v13;
+ v9.y += u32(v9.x < v13.x);
+ v5 ^= v9;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = v14.yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(24u)) | (v6 << vec2<u32>(8u)).yx;
+ v2 += v6;
+ v2.y += u32(v2.x < v6.x);
+ // NOP
+ v14 ^= v2;
+ v14 = (v14 >> vec2<u32>(16u)) | (v14 << vec2<u32>(16u)).yx;
+ v10 += v14;
+ v10.y += u32(v10.x < v14.x);
+ v6 ^= v10;
+ v6 = (v6 >> vec2<u32>(31u)).yx | (v6 << vec2<u32>(1u));
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = v15.yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v3 += v7;
+ v3.y += u32(v3.x < v7.x);
+ // NOP
+ v15 ^= v3;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v11 += v15;
+ v11.y += u32(v11.x < v15.x);
+ v7 ^= v11;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ v0 += m1;
+ v0.y += u32(v0.x < m1.x);
+ v15 ^= v0;
+ v15 = v15.yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(24u)) | (v5 << vec2<u32>(8u)).yx;
+ v0 += v5;
+ v0.y += u32(v0.x < v5.x);
+ // NOP
+ v15 ^= v0;
+ v15 = (v15 >> vec2<u32>(16u)) | (v15 << vec2<u32>(16u)).yx;
+ v10 += v15;
+ v10.y += u32(v10.x < v15.x);
+ v5 ^= v10;
+ v5 = (v5 >> vec2<u32>(31u)).yx | (v5 << vec2<u32>(1u));
+ // G NOP
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = v13.yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(24u)) | (v7 << vec2<u32>(8u)).yx;
+ v2 += v7;
+ v2.y += u32(v2.x < v7.x);
+ // NOP
+ v13 ^= v2;
+ v13 = (v13 >> vec2<u32>(16u)) | (v13 << vec2<u32>(16u)).yx;
+ v8 += v13;
+ v8.y += u32(v8.x < v13.x);
+ v7 ^= v8;
+ v7 = (v7 >> vec2<u32>(31u)).yx | (v7 << vec2<u32>(1u));
+ // G NOP
+ // Set nonce if it exceeds difficulty threshold and no other thread has set it.
+ let result = vec2<u32>(0xf3bcc908u, 0x6a09e667u) ^ vec2<u32>(0x01010008u, 0x0u) ^ v0 ^ v8;
+ if (result.y > input.difficulty.y || (result.y == input.difficulty.y && result.x >= input.difficulty.x)) {
+ loop {
+ let swap = atomicCompareExchangeWeak(&output.found, 0u, 1u);
+ if (swap.exchanged) {
+ output.work = m0;
+ output.difficulty = result;
+ break;
+ }
+ if (swap.old_value != 0u) {
+ break;
+ }
+ }
+ return;
+ }
+}