--- /dev/null
+# Benchmarks to compute 16,777,216 nonces
+_Each test is 128 samples of one pass (dispatch or frame) at zero threshold with no early exit_
+
+## Summary
+- Chromium WebGPU and Firefox WebGL are the clear winners
+- Chromium WebGL seems to suffer from an Nvidia driver issue
+- Firefox WebGPU exhibits a strange implementation bottleneck
+ - It seems to restrict each pass to a minimum of 200ms
+ - Not shown here: This minimum is higher or lower depending on dispatch size
+- Safari WebGPU performance was maintained between versions
+- WebGL downsampling shader minimized readback lag and thus enabled much larger frames
+- WebGPU improved dramatically on non-mobile platforms, almost halving frame times
+
+
+## All Results
+| Version | System | Browser | API | Total | Rate | Median | Mean |
+|-----------|--------------|-------------|-----------|-----------|----------|----------|----------|
+| RC | RTX 3070 | Chromium N | WebGPU | 747 | 218.23 | 4.5 | 4.58 |
+| RC | RTX 3070 | Firefox N | WebGL | 1096 | 117.06 | 8.55 | 8.54 |
+| RC | RTX 3070 | Firefox | WebGL | 1174 | 116.57 | 9 | 8.58 |
+| 2.0.0 | RTX 3070 | Chromium N | WebGPU | 1339 | 112.02 | ? | 10.47 |
+| RC | Intel Xe | Chromium N | WebGPU | 5107 | 25.20 | 39.5 | 39.69 |
+| RC | Intel Xe | Firefox | WebGL | 7166 | 18.84 | 52 | 53.07 |
+| RC | Intel Xe | Firefox N | WebGL | 8060 | 16.71 | 63 | 59.83 |
+| RC | iPhone 12 | Safari | WebGPU | 8586 | 14.97 | 67 | 66.79 |
+| 2.0.0 | iPhone 12 | Safari | WebGPU | 8765 | 14.69 | 68 | 68.48 |
+| RC | Intel Xe | Chromium N | WebGL | 9764 | 15.42 | 62.8 | 64.85 |
+| 2.0.0 | Intel Xe | Chromium N | WebGPU | 10103 | 12.63 | ? | 78.93 |
+| RC | RTX 3070 | Chromium N | WebGL | 10681 | 19.81 | 50.60 | 50.47 |
+| RC | iPhone 12 | Safari | WebGL | 12631 | 10.54 | 95 | 94.86 |
+| RC | iPad Mini 5 | Safari | WebGPU | 14232 | 8.78 | 114 | 113.85 |
+| 2.0.0 | iPad Mini 5 | Safari | WebGPU | 14903 | 8.59 | 118 | 116.41 |
+| RC | iPad Mini 5 | Safari | WebGL | 18928 | 6.97 | 145 | 143.40 |
+| RC | Intel Xe | Firefox N | WebGPU | 25679 | 4.99 | 200 | 200.47 |
+| 2.0.0 | Intel Xe | Firefox N | WebGPU | 25805 | 4.94 | ? | 201.60 |
+| 2.0.0 | RTX 3070 | Firefox N | WebGPU | 25629 | 4.97 | ? | 200.23 |
+| RC | RTX 3070 | Firefox N | WebGPU | 25633 | 5.00 | 200 | 200.15 |
+| 2.0.0 | RTX 3070 | Firefox | WebGL | 35224 | 3.72 | ? | 275.19 |
+| 2.0.0 | RTX 3070 | Chromium N | WebGL | 47603 | 3.06 | ? | 371.90 |
+| 2.0.0 | RTX 3070 | Firefox N | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | Intel Xe | Firefox | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | Intel Xe | Firefox N | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | Intel Xe | Chromium N | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | iPhone 12 | Safari | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | iPad Mini 5 | Safari | WebGL | Unusable | N/A | ? | N/A |
+
+## RTX 3070
+| Version | Browser | API | Total | Rate | Median | Mean |
+|-----------|-------------|-----------|-----------|----------|----------|----------|
+| RC | Chromium N | WebGPU | 747 | 218.23 | 4.5 | 4.58 |
+| RC | Firefox N | WebGL | 1096 | 117.06 | 8.55 | 8.54 |
+| RC | Firefox | WebGL | 1174 | 116.57 | 9 | 8.58 |
+| 2.0.0 | Chromium N | WebGPU | 1339 | 112.02 | ? | 10.47 |
+| RC | Chromium N | WebGL | 10681 | 19.81 | 50.60 | 50.47 |
+| 2.0.0 | Firefox N | WebGPU | 25629 | 4.97 | ? | 200.23 |
+| RC | Firefox N | WebGPU | 25633 | 5.00 | 200 | 200.15 |
+| 2.0.0 | Firefox | WebGL | 35224 | 3.72 | ? | 275.19 |
+| 2.0.0 | Chromium N | WebGL | 47603 | 3.06 | ? | 371.90 |
+| 2.0.0 | Firefox N | WebGL | Unusable | N/A | ? | N/A |
+
+## Intel Xe integrated graphics
+| Version | Browser | API | Total | Rate | Median | Mean |
+|-----------|-------------|-----------|-----------|----------|----------|----------|
+| RC | Chromium N | WebGPU | 5107 | 25.20 | 39.5 | 39.69 |
+| RC | Firefox | WebGL | 7166 | 18.84 | 52 | 53.07 |
+| RC | Firefox N | WebGL | 8060 | 16.71 | 63 | 59.83 |
+| RC | Chromium N | WebGL | 9764 | 15.42 | 62.8 | 64.85 |
+| 2.0.0 | Chromium N | WebGPU | 10103 | 12.63 | ? | 78.93 |
+| RC | Firefox N | WebGPU | 25679 | 4.99 | 200 | 200.47 |
+| 2.0.0 | Firefox N | WebGPU | 25805 | 4.94 | ? | 201.60 |
+| 2.0.0 | Firefox | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | Firefox N | WebGL | Unusable | N/A | ? | N/A |
+| 2.0.0 | Chromium N | WebGL | Unusable | N/A | ? | N/A |
+
+## iPhone 12 (A14 Bionic, Apple-designed 4-core GPU)
+| Version | Browser | API | Total | Rate | Median | Mean |
+|-----------|-------------|-----------|-----------|----------|----------|----------|
+| RC | Safari | WebGPU | 8586 | 14.97 | 67 | 66.79 |
+| 2.0.0 | Safari | WebGPU | 8765 | 14.69 | 68 | 68.48 |
+| RC | Safari | WebGL | 12631 | 10.54 | 95 | 94.86 |
+| 2.0.0 | Safari | WebGL | Unusable | N/A | ? | N/A |
+
+## iPad Mini 5 (A12 Bionic, G11P 4-core GPU)
+| Version | Browser | API | Total | Rate | Median | Mean |
+|-----------|-------------|-----------|-----------|----------|----------|----------|
+| RC | Safari | WebGPU | 14232 | 8.78 | 114 | 113.85 |
+| 2.0.0 | Safari | WebGPU | 14903 | 8.59 | 118 | 116.41 |
+| RC | Safari | WebGL | 18928 | 6.97 | 145 | 143.40 |
+| 2.0.0 | Safari | WebGL | Unusable | N/A | ? | N/A |
// src/shaders/compute.wgsl
-var compute_default = `struct UBO{blockhash:array<vec4<u32>,2>,seed:vec2<u32>,threshold:u32};@group(0)@binding(0)var<uniform> ubo:UBO;struct WORK{nonce:vec2<u32>,found:atomic<u32>};@group(0)@binding(1)var<storage,read_write>work:WORK;const BLAKE2B_IV_0=vec2(0xF2BDC900u,0x6A09E667u);const ROTATE_1=vec2(1u);const ROTATE_8=vec2(8u);const ROTATE_16=vec2(16u);const ROTATE_24=vec2(24u);const ROTATE_31=vec2(31u);var<workgroup> found:bool;@compute @workgroup_size(32)fn search(@builtin(global_invocation_id)global_id:vec3<u32>,@builtin(local_invocation_id)local_id:vec3<u32>){main(global_id);}@compute @workgroup_size(1)fn validate(@builtin(global_invocation_id)global_id:vec3<u32>){main(global_id);}fn main(id:vec3<u32>){let m0:vec2<u32>=ubo.seed ^ id.xy;let m1:vec2<u32>=ubo.blockhash[0u].xy;let m2:vec2<u32>=ubo.blockhash[0u].zw;let m3:vec2<u32>=ubo.blockhash[1u].xy;let m4:vec2<u32>=ubo.blockhash[1u].zw;var v0:vec2<u32>=BLAKE2B_IV_0;var v1:vec2<u32>=vec2(0x84CAA73Bu,0xBB67AE85u);var v2:vec2<u32>=vec2(0xFE94F82Bu,0x3C6EF372u);var v3:vec2<u32>=vec2(0x5F1D36F1u,0xA54FF53Au);var v4:vec2<u32>=vec2(0xADE682D1u,0x510E527Fu);var v5:vec2<u32>=vec2(0x2B3E6C1Fu,0x9B05688Cu);var v6:vec2<u32>=vec2(0xFB41BD6Bu,0x1F83D9ABu);var v7:vec2<u32>=vec2(0x137E2179u,0x5BE0CD19u);var v8:vec2<u32>=vec2(0xF3BCC908u,0x6A09E667u);var v9:vec2<u32>=vec2(0x84CAA73Bu,0xBB67AE85u);var vA:vec2<u32>=vec2(0xFE94F82Bu,0x3C6EF372u);var vB:vec2<u32>=vec2(0x5F1D36F1u,0xA54FF53Au);var vC:vec2<u32>=vec2(0xADE682F9u,0x510E527Fu);var vD:vec2<u32>=vec2(0x2B3E6C1Fu,0x9B05688Cu);var vE:vec2<u32>=vec2(0x04BE4294u,0xE07C2654u);var vF:vec2<u32>=vec2(0x137E2179u,0x5BE0CD19u);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m0+vec2(0u,u32(v0.x+m0.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m1+vec2(0u,u32(v0.x+m1.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ 
v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m2+vec2(0u,u32(v1.x+m2.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m3+vec2(0u,u32(v1.x+m3.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m4+vec2(0u,u32(v2.x+m4.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ 
v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m4+vec2(0u,u32(v1.x+m4.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m1+vec2(0u,u32(v0.x+m1.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ 
v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));v1=v1+m0+vec2(0u,u32(v1.x+m0.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));v1=v1+m2+vec2(0u,u32(v1.x+m2.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m3+vec2(0u,u32(v3.x+m3.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m0+vec2(0u,u32(v1.x+m0.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ 
vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m2+vec2(0u,u32(v2.x+m2.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));v1=v1+m3+vec2(0u,u32(v1.x+m3.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m1+vec2(0u,u32(v2.x+m1.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m4+vec2(0u,u32(v3.x+m4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ 
v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m3+vec2(0u,u32(v1.x+m3.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m1+vec2(0u,u32(v1.x+m1.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m2+vec2(0u,u32(v0.x+m2.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ 
v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m4+vec2(0u,u32(v2.x+m4.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m0+vec2(0u,u32(v2.x+m0.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m0+vec2(0u,u32(v0.x+m0.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m2+vec2(0u,u32(v2.x+m2.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m4+vec2(0u,u32(v2.x+m4.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ 
v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m1+vec2(0u,u32(v0.x+m1.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m3+vec2(0u,u32(v3.x+m3.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m2+vec2(0u,u32(v0.x+m2.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ 
v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m0+vec2(0u,u32(v2.x+m0.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));v3=v3+m3+vec2(0u,u32(v3.x+m3.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m4+vec2(0u,u32(v0.x+m4.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ 
v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m1+vec2(0u,u32(v3.x+m1.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m1+vec2(0u,u32(v1.x+m1.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));v3=v3+m4+vec2(0u,u32(v3.x+m4.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m0+vec2(0u,u32(v0.x+m0.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ 
vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));v1=v1+m3+vec2(0u,u32(v1.x+m3.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m2+vec2(0u,u32(v2.x+m2.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ 
vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m1+vec2(0u,u32(v2.x+m1.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));v3=v3+m3+vec2(0u,u32(v3.x+m3.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m0+vec2(0u,u32(v0.x+m0.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));v1=v1+m4+vec2(0u,u32(v1.x+m4.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m2+vec2(0u,u32(v3.x+m2.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 
^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m3+vec2(0u,u32(v2.x+m3.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));v3=v3+m0+vec2(0u,u32(v3.x+m0.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m2+vec2(0u,u32(v0.x+m2.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ 
v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m1+vec2(0u,u32(v2.x+m1.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m4+vec2(0u,u32(v2.x+m4.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m2+vec2(0u,u32(v0.x+m2.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m4+vec2(0u,u32(v1.x+m4.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));v3=v3+m1+vec2(0u,u32(v3.x+m1.x<v3.x));vF=(vF ^ 
v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));v2=v2+m3+vec2(0u,u32(v2.x+m3.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));v3=v3+m0+vec2(0u,u32(v3.x+m0.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m0+vec2(0u,u32(v0.x+m0.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));v0=v0+m1+vec2(0u,u32(v0.x+m1.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ 
v8)<<ROTATE_1)|((v4 ^ v8).yx>>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m2+vec2(0u,u32(v1.x+m2.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m3+vec2(0u,u32(v1.x+m3.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));v2=v2+m4+vec2(0u,u32(v2.x+m4.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)<<ROTATE_1)|((v6 ^ vA).yx>>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));vF=((vF ^ v0)>>ROTATE_16)|((vF ^ v0).yx<<ROTATE_16);vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)<<ROTATE_1)|((v5 ^ vA).yx>>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=(vC ^ v1).yx;vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)>>ROTATE_24)|((v6 ^ vB).yx<<ROTATE_8);v1=v1+v6+vec2(0u,u32(v1.x+v6.x<v1.x));vC=((vC ^ v1)>>ROTATE_16)|((vC ^ v1).yx<<ROTATE_16);vB=vB+vC+vec2(0u,u32(vB.x+vC.x<vB.x));v6=((v6 ^ vB)<<ROTATE_1)|((v6 ^ vB).yx>>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ 
v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)<<ROTATE_1)|((v7 ^ v8).yx>>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=(vE ^ v3).yx;v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)>>ROTATE_24)|((v4 ^ v9).yx<<ROTATE_8);v3=v3+v4+vec2(0u,u32(v3.x+v4.x<v3.x));vE=((vE ^ v3)>>ROTATE_16)|((vE ^ v3).yx<<ROTATE_16);v9=v9+vE+vec2(0u,u32(v9.x+vE.x<v9.x));v4=((v4 ^ v9)<<ROTATE_1)|((v4 ^ v9).yx>>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=(vC ^ v0).yx;v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v4=((v4 ^ v8)>>ROTATE_24)|((v4 ^ v8).yx<<ROTATE_8);v0=v0+v4+vec2(0u,u32(v0.x+v4.x<v0.x));vC=((vC ^ v0)>>ROTATE_16)|((vC ^ v0).yx<<ROTATE_16);v8=v8+vC+vec2(0u,u32(v8.x+vC.x<v8.x));v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));v1=v1+m4+vec2(0u,u32(v1.x+m4.x<v1.x));vD=(vD ^ v1).yx;v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)>>ROTATE_24)|((v5 ^ v9).yx<<ROTATE_8);v1=v1+v5+vec2(0u,u32(v1.x+v5.x<v1.x));vD=((vD ^ v1)>>ROTATE_16)|((vD ^ v1).yx<<ROTATE_16);v9=v9+vD+vec2(0u,u32(v9.x+vD.x<v9.x));v5=((v5 ^ v9)<<ROTATE_1)|((v5 ^ v9).yx>>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=(vE ^ v2).yx;vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v6=((v6 ^ vA)>>ROTATE_24)|((v6 ^ vA).yx<<ROTATE_8);v2=v2+v6+vec2(0u,u32(v2.x+v6.x<v2.x));vE=((vE ^ v2)>>ROTATE_16)|((vE ^ v2).yx<<ROTATE_16);vA=vA+vE+vec2(0u,u32(vA.x+vE.x<vA.x));v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=(vF ^ v3).yx;vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)>>ROTATE_24)|((v7 ^ vB).yx<<ROTATE_8);v3=v3+v7+vec2(0u,u32(v3.x+v7.x<v3.x));vF=((vF ^ v3)>>ROTATE_16)|((vF ^ v3).yx<<ROTATE_16);vB=vB+vF+vec2(0u,u32(vB.x+vF.x<vB.x));v7=((v7 ^ vB)<<ROTATE_1)|((v7 ^ vB).yx>>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v0=v0+m1+vec2(0u,u32(v0.x+m1.x<v0.x));vF=(vF ^ v0).yx;vA=vA+vF+vec2(0u,u32(vA.x+vF.x<vA.x));v5=((v5 ^ vA)>>ROTATE_24)|((v5 ^ vA).yx<<ROTATE_8);v0=v0+v5+vec2(0u,u32(v0.x+v5.x<v0.x));v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=(vD ^ v2).yx;v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));v7=((v7 ^ v8)>>ROTATE_24)|((v7 ^ 
v8).yx<<ROTATE_8);v2=v2+v7+vec2(0u,u32(v2.x+v7.x<v2.x));vD=((vD ^ v2)>>ROTATE_16)|((vD ^ v2).yx<<ROTATE_16);v8=v8+vD+vec2(0u,u32(v8.x+vD.x<v8.x));if((BLAKE2B_IV_0.y ^ v0.y ^ v8.y)>ubo.threshold&&atomicLoad(&work.found)==0u){atomicStore(&work.found,1u);work.nonce=m0;}return;}`;
+var compute_default = "struct UBO{blockhash:array<vec4<u32>,2>,seed:vec2<u32>,threshold:u32};@group(0)@binding(0)var<uniform> ubo:UBO;struct WORK{nonce:vec2<u32>,found:atomic<u32>};@group(0)@binding(1)var<storage,read_write>work:WORK;const BLAKE2B_IV_0=vec2(0xF2BDC900u,0x6A09E667u);const Z=vec2(0u);const CARRY=vec4(1u,0u,1u,0u);const ROTATE_1=vec4(1u);const ROTATE_8=vec4(8u);const ROTATE_16=vec4(16u);const ROTATE_24=vec4(24u);const ROTATE_31=vec4(31u);var<workgroup> found:bool;@compute @workgroup_size(32)fn search(@builtin(global_invocation_id)global_id:vec3<u32>,@builtin(local_invocation_id)local_id:vec3<u32>){found=(local_id.x==0u&&atomicLoad(&work.found)!=0u);workgroupBarrier();if(found){return;}main(global_id);}@compute @workgroup_size(1)fn validate(@builtin(global_invocation_id)global_id:vec3<u32>){main(global_id);}fn main(id:vec3<u32>){let m0:vec2<u32>=ubo.seed ^ id.xy;let m1:vec2<u32>=ubo.blockhash[0u].xy;let m2:vec2<u32>=ubo.blockhash[0u].zw;let m3:vec2<u32>=ubo.blockhash[1u].xy;let m4:vec2<u32>=ubo.blockhash[1u].zw;var v01:vec4<u32>=vec4(BLAKE2B_IV_0,0x84CAA73Bu,0xBB67AE85u);var v23:vec4<u32>=vec4(0xFE94F82Bu,0x3C6EF372u,0x5F1D36F1u,0xA54FF53Au);var v45:vec4<u32>=vec4(0xADE682D1u,0x510E527Fu,0x2B3E6C1Fu,0x9B05688Cu);var v67:vec4<u32>=vec4(0xFB41BD6Bu,0x1F83D9ABu,0x137E2179u,0x5BE0CD19u);var v89:vec4<u32>=vec4(0xF3BCC908u,0x6A09E667u,0x84CAA73Bu,0xBB67AE85u);var vAB:vec4<u32>=vec4(0xFE94F82Bu,0x3C6EF372u,0x5F1D36F1u,0xA54FF53Au);var vCD:vec4<u32>=vec4(0xADE682F9u,0x510E527Fu,0x2B3E6C1Fu,0x9B05688Cu);var vEF:vec4<u32>=vec4(0x04BE4294u,0xE07C2654u,0x137E2179u,0x5BE0CD19u);var v56:vec4<u32>;var vFC:vec4<u32>;var v74:vec4<u32>;var vDE:vec4<u32>;var s0:vec4<u32>;var s1:vec4<u32>;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m0,m2);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;v23+=vec4(m4,Z);v23.y+=u32(v23.x<m4.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ 
v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 ^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m1,m3);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m4);v01.w+=u32(v01.z<m4.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m1,m0);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m2);v01.w+=u32(v01.z<m2.x);v23+=vec4(Z,m3);v23.w+=u32(v23.z<m3.x);vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(Z,m0);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+vec4(m2,Z);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01.z+=m3.x;v01.w+=m3.y+u32(v01.z<m3.x);vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m1,m4)+vec4(0u,u32(v23.x+vec4(m1,m4).x<v23.x),0u,u32(v23.z+vec4(m1,m4).z<v23.z));vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m3)+vec4(Z,0u,u32(v01.z+vec4(Z,m3).z<v01.z));vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m1)+vec4(Z,0u,u32(v01.z+vec4(Z,m1).z<v01.z));vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m2,Z)+vec4(0u,u32(v01.x+vec4(m2,Z).x<v01.x),Z);v23+=vec4(m4,Z)+vec4(0u,u32(v23.x+vec4(m4,Z).x<v23.x),Z);vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m0,Z)+vec4(0u,u32(v23.x+vec4(m0,Z).x<v23.x),Z);vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s1=v23+vec4(m2,Z);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m0,Z)+vec4(0u,u32(v01.x+vec4(m0,Z).x<v01.x),Z);v23+=vec4(m4,Z)+vec4(0u,u32(v23.x+vec4(m4,Z).x<v23.x),Z);vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s1=v23+vec4(Z,m3);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m1,Z)+vec4(0u,u32(v01.x+vec4(m1,Z).x<v01.x),Z);vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m2,Z)+vec4(0u,u32(v01.x+vec4(m2,Z).x<v01.x),Z);v23+=vec4(m0,Z)+vec4(0u,u32(v23.x+vec4(m0,Z).x<v23.x),Z);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 
^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 ^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s1=v23+vec4(Z,m3);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m4,Z)+vec4(0u,u32(v01.x+vec4(m4,Z).x<v01.x),Z);v23+=vec4(Z,m1)+vec4(Z,0u,u32(v23.z+vec4(Z,m1).z<v23.z));vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m1)+vec4(Z,0u,u32(v01.z+vec4(Z,m1).z<v01.z));v23+=vec4(Z,m4)+vec4(Z,0u,u32(v23.z+vec4(Z,m4).z<v23.z));vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 
^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 ^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m0,Z)+vec4(0u,u32(v01.x+vec4(m0,Z).x<v01.x),Z);vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m3)+vec4(Z,0u,u32(v01.z+vec4(Z,m3).z<v01.z));s1=v23+vec4(m2,Z);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s1=v23+vec4(Z,m3);v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m1,Z)+vec4(0u,u32(v23.x+vec4(m1,Z).x<v23.x),Z);vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(Z,m2)+vec4(Z,0u,u32(v23.z+vec4(Z,m2).z<v23.z));vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m0,m4)+vec4(0u,u32(v01.x+vec4(m0,m4).x<v01.x),0u,u32(v01.z+vec4(m0,m4).z<v01.z));vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(Z,m0);v23.w+=u32(v23.z<m0.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m3,Z)+vec4(0u,u32(v23.x+vec4(m3,Z).x<v23.x),Z);vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m1,Z)+vec4(0u,u32(v23.x+vec4(m1,Z).x<v23.x),Z);vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(m2,Z)+vec4(0u,u32(v01.x+vec4(m2,Z).x<v01.x),Z);v23+=vec4(m4,Z)+vec4(0u,u32(v23.x+vec4(m4,Z).x<v23.x),Z);vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(Z,m1);v23.w+=u32(v23.z<m1.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m2,m4);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(m3,Z);v23.y+=u32(v23.x<m3.x);vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v23+=vec4(Z,m0);v23.w+=u32(v23.z<m0.x);vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m0,m2);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;v23+=vec4(m4,Z);v23.y+=u32(v23.x<m4.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 
^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m1,m3);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF ^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vFC ^=v01;vFC=(vFC>>ROTATE_16)|(vFC<<ROTATE_16).yxwz;vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_31).yxwz|(v56<<ROTATE_1);v74 ^=v89;v74=(v74>>ROTATE_31).yxwz|(v74<<ROTATE_1);v45=vec4(v74.zw,v56.xy);v67=vec4(v56.zw,v74.xy);vCD=vec4(vFC.zw,vDE.xy);vEF=vec4(vDE.zw,vFC.xy);s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m4);v01.w+=u32(v01.z<m4.x);vCD=(vCD ^ v01).yxwz;vEF=(vEF ^ v23).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_24)|(v45<<ROTATE_8).yxwz;v67 ^=vAB;v67=(v67>>ROTATE_24)|(v67<<ROTATE_8).yxwz;s0=v01+v45;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v67;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;vCD ^=v01;vCD=(vCD>>ROTATE_16)|(vCD<<ROTATE_16).yxwz;vEF 
^=v23;vEF=(vEF>>ROTATE_16)|(vEF<<ROTATE_16).yxwz;s0=v89+vCD;v89=s0+(vec4<u32>(s0<v89)&CARRY).yxwz;s1=vAB+vEF;vAB=s1+(vec4<u32>(s1<vAB)&CARRY).yxwz;v45 ^=v89;v45=(v45>>ROTATE_31).yxwz|(v45<<ROTATE_1);v67 ^=vAB;v67=(v67>>ROTATE_31).yxwz|(v67<<ROTATE_1);v56=vec4(v45.zw,v67.xy);v74=vec4(v67.zw,v45.xy);vFC=vec4(vEF.zw,vCD.xy);vDE=vec4(vCD.zw,vEF.xy);s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;s0=v01+vec4(m1,m0);v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;vFC=(vFC ^ v01).yxwz;vDE=(vDE ^ v23).yxwz;s0=vAB+vFC;vAB=s0+(vec4<u32>(s0<vAB)&CARRY).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;v56 ^=vAB;v56=(v56>>ROTATE_24)|(v56<<ROTATE_8).yxwz;v74 ^=v89;v74=(v74>>ROTATE_24)|(v74<<ROTATE_8).yxwz;s0=v01+v56;v01=s0+(vec4<u32>(s0<v01)&CARRY).yxwz;s1=v23+v74;v23=s1+(vec4<u32>(s1<v23)&CARRY).yxwz;v01+=vec4(Z,m2);v01.w+=u32(v01.z<m2.x);v23+=vec4(Z,m3);v23.w+=u32(v23.z<m3.x);vDE ^=v23;vDE=(vDE>>ROTATE_16)|(vDE<<ROTATE_16).yxwz;s1=v89+vDE;v89=s1+(vec4<u32>(s1<v89)&CARRY).yxwz;if((BLAKE2B_IV_0.y ^ v01.y ^ v89.y)>ubo.threshold&&atomicLoad(&work.found)==0u){atomicStore(&work.found,1u);work.nonce=m0;}return;}";
-// src/shaders/gl-fragment.ts
-var NanoPowGlFragmentShader = `#version 300 es
+// src/shaders/gl-downsample.ts
+var NanoPowGlDownsampleShader = `#version 300 es
#pragma vscode_glsllint_stage: frag
+#ifdef GL_FRAGMENT_PRECISION_HIGH
precision highp float;
+#else
+precision mediump float;
+#endif
+precision highp int;
-in vec2 uv_pos;
out uvec4 nonce;
-// blockhash - array of precalculated block hash components
+// source texture to be downsampled
+uniform highp usampler2D src;
+
+void main() {
+ nonce = uvec4(0u);
+ vec2 inputSize = vec2(textureSize(src, 0));
+ vec2 texel = vec2(1.0) / inputSize;
+ vec2 blockCoord = (floor(gl_FragCoord.xy) * 2.0 + vec2(0.5)) / inputSize;
+
+ uvec4 pixel = texture(src, blockCoord);
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(texel.x, 0.0));
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(0.0, texel.y));
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(texel.x, texel.y));
+ nonce = pixel.x == 0u ? nonce : pixel;
+}
+`;
+
+// src/shaders/gl-draw.ts
+var NanoPowGlDrawShader = `#version 300 es
+#pragma vscode_glsllint_stage: frag
+#ifdef GL_FRAGMENT_PRECISION_HIGH
+precision highp float;
+#else
+precision mediump float;
+#endif
+
+out uvec4 nonce;
+
+// blockhash - Array of precalculated block hash components
// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else
-// workload - Defines canvas size
+// search - Checks all pixels if true, else only checks 1 pixel to validate
layout(std140) uniform UBO {
uint blockhash[8];
uint threshold;
- float workload;
+ bool search;
};
-// Random work values
+// Random work seed values
layout(std140) uniform WORK {
- uvec2 work;
+ uvec2 seed;
};
-// Defined separately from uint v[32] below as the original value is required
+// Defined separately from uint v[0].y below as the original value is required
// to calculate the second uint32 of the digest for threshold comparison
const uint BLAKE2B_IV32_1 = 0x6A09E667u;
// Used during G for vector bit rotations
-const uvec2 ROTATE_1 = uvec2(1u, 1u);
-const uvec2 ROTATE_8 = uvec2(8u, 8u);
-const uvec2 ROTATE_16 = uvec2(16u, 16u);
-const uvec2 ROTATE_24 = uvec2(24u, 24u);
-const uvec2 ROTATE_31 = uvec2(31u, 31u);
+const uvec4 ROTATE_1 = uvec4(1u);
+const uvec4 ROTATE_8 = uvec4(8u);
+const uvec4 ROTATE_16 = uvec4(16u);
+const uvec4 ROTATE_24 = uvec4(24u);
+const uvec4 ROTATE_31 = uvec4(31u);
// Both buffers represent 16 uint64s as 32 uint32s
// because that's what GLSL offers, just like Javascript
// OUTLEN is constant 8 bytes
// v[0] ^= 0x01010000u ^ uint(OUTLEN);
// INLEN is constant 40 bytes: work value (8) + block hash (32)
-// v[24] ^= uint(INLEN);
+// v[12] ^= uint(INLEN);
// It's always the "last" compression at this INLEN
-// v[28] = ~v[28];
-// v[29] = ~v[29];
-uvec2 v[16] = uvec2[16](
+// v[14] = ~v[14];
+const uvec2 blake2b_iv[16] = uvec2[16](
uvec2(0xF2BDC900u, 0x6A09E667u),
uvec2(0x84CAA73Bu, 0xBB67AE85u),
uvec2(0xFE94F82Bu, 0x3C6EF372u),
uvec2(0x137E2179u, 0x5BE0CD19u)
);
+// Iterated initialization vector
+uvec2 v[16];
+
// Input data buffer
uvec2 m[16];
-// Offsets into the input data buffer for each mixing step
-const uint SIGMA[192] = uint[192](
- 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
- 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u,
- 11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u,
- 7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u,
- 9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u,
- 2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u,
- 12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u,
- 13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u,
- 6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u,
- 10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u,
- 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
- 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u
-);
+// G mixing function, compressing two subprocesses into one
+void G (
+ uint a0, uint b0, uint c0, uint d0, uvec2 x0, uvec2 y0,
+ uint a1, uint b1, uint c1, uint d1, uvec2 x1, uvec2 y1
+) {
+ uvec4 a = uvec4(v[a0], v[a1]);
+ uvec4 b = uvec4(v[b0], v[b1]);
+ uvec4 c = uvec4(v[c0], v[c1]);
+ uvec4 d = uvec4(v[d0], v[d1]);
+ uvec4 mx = uvec4(x0, x1);
+ uvec4 my = uvec4(y0, y1);
-// G mixing function
-void G (uint a, uint b, uint c, uint d, uint x, uint y) {
- v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
- v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x));
- v[d] = (v[d] ^ v[a]).yx;
- v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
- v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ v[c]).yx << ROTATE_8);
- v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
- v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x));
- v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16);
- v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
- v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1);
+ a = a + b + uvec4(0u, uint(a.x + b.x < a.x), 0u, uint(a.z + b.z < a.z));
+ a = a + mx + uvec4(0u, uint(a.x + mx.x < a.x), 0u, uint(a.z + mx.z < a.z));
+ d = (d ^ a).yxwz;
+ c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z));
+ b = ((b ^ c) >> ROTATE_24) | ((b ^ c) << ROTATE_8).yxwz;
+ a = a + b + uvec4(0u, uint(a.x + b.x < b.x), 0u, uint(a.z + b.z < b.z));
+ a = a + my + uvec4(0u, uint(a.x + my.x < a.x), 0u, uint(a.z + my.z < a.z));
+ d = ((d ^ a) >> ROTATE_16) | ((d ^ a) << ROTATE_16).yxwz;
+ c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z));
+ b = ((b ^ c) >> ROTATE_31).yxwz | ((b ^ c) << ROTATE_1);
+
+ v[a0] = a.xy;
+ v[b0] = b.xy;
+ v[c0] = c.xy;
+ v[d0] = d.xy;
+ v[a1] = a.zw;
+ v[b1] = b.zw;
+ v[c1] = c.zw;
+ v[d1] = d.zw;
}
void main() {
+ // Initialize fragment output
+ nonce = uvec4(0u);
+
// Nonce uniquely differentiated by pixel location
- m[0u].x = work.x ^ uint(uv_pos.x * workload);
- m[0u].y = work.y ^ uint(uv_pos.y * workload);
+ m[0u] = seed ^ uvec2(gl_FragCoord);
// Block hash
m[1u] = uvec2(blockhash[0u], blockhash[1u]);
m[3u] = uvec2(blockhash[4u], blockhash[5u]);
m[4u] = uvec2(blockhash[6u], blockhash[7u]);
- // twelve rounds of mixing
- for(uint i = 0u; i < 12u; i = i + 1u) {
- G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]);
- G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]);
- G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]);
- G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]);
- G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]);
- G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]);
- G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]);
- G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]);
- }
+ // Reset v
+ v = blake2b_iv;
+
+ // Twelve rounds of G mixing
+
+ // Round 0
+ G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]);
+ G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]);
+ G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]);
+ G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]);
+
+ // Round 1
+ G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]);
+ G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]);
+ G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]);
+ G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]);
+
+ // Round 2
+ G(0u, 4u, 8u, 12u, m[11u], m[8u], 1u, 5u, 9u, 13u, m[12u], m[0u]);
+ G(2u, 6u, 10u, 14u, m[5u], m[2u], 3u, 7u, 11u, 15u, m[15u], m[13u]);
+ G(0u, 5u, 10u, 15u, m[10u], m[14u], 1u, 6u, 11u, 12u, m[3u], m[6u]);
+ G(2u, 7u, 8u, 13u, m[7u], m[1u], 3u, 4u, 9u, 14u, m[9u], m[4u]);
+
+ // Round 3
+ G(0u, 4u, 8u, 12u, m[7u], m[9u], 1u, 5u, 9u, 13u, m[3u], m[1u]);
+ G(2u, 6u, 10u, 14u, m[13u], m[12u], 3u, 7u, 11u, 15u, m[11u], m[14u]);
+ G(0u, 5u, 10u, 15u, m[2u], m[6u], 1u, 6u, 11u, 12u, m[5u], m[10u]);
+ G(2u, 7u, 8u, 13u, m[4u], m[0u], 3u, 4u, 9u, 14u, m[15u], m[8u]);
+
+ // Round 4
+ G(0u, 4u, 8u, 12u, m[9u], m[0u], 1u, 5u, 9u, 13u, m[5u], m[7u]);
+ G(2u, 6u, 10u, 14u, m[2u], m[4u], 3u, 7u, 11u, 15u, m[10u], m[15u]);
+ G(0u, 5u, 10u, 15u, m[14u], m[1u], 1u, 6u, 11u, 12u, m[11u], m[12u]);
+ G(2u, 7u, 8u, 13u, m[6u], m[8u], 3u, 4u, 9u, 14u, m[3u], m[13u]);
- // Pixel data set from work values
+ // Round 5
+ G(0u, 4u, 8u, 12u, m[2u], m[12u], 1u, 5u, 9u, 13u, m[6u], m[10u]);
+ G(2u, 6u, 10u, 14u, m[0u], m[11u], 3u, 7u, 11u, 15u, m[8u], m[3u]);
+ G(0u, 5u, 10u, 15u, m[4u], m[13u], 1u, 6u, 11u, 12u, m[7u], m[5u]);
+ G(2u, 7u, 8u, 13u, m[15u], m[14u], 3u, 4u, 9u, 14u, m[1u], m[9u]);
+
+ // Round 6
+ G(0u, 4u, 8u, 12u, m[12u], m[5u], 1u, 5u, 9u, 13u, m[1u], m[15u]);
+ G(2u, 6u, 10u, 14u, m[14u], m[13u], 3u, 7u, 11u, 15u, m[4u], m[10u]);
+ G(0u, 5u, 10u, 15u, m[0u], m[7u], 1u, 6u, 11u, 12u, m[6u], m[3u]);
+ G(2u, 7u, 8u, 13u, m[9u], m[2u], 3u, 4u, 9u, 14u, m[8u], m[11u]);
+
+ // Round 7
+ G(0u, 4u, 8u, 12u, m[13u], m[11u], 1u, 5u, 9u, 13u, m[7u], m[14u]);
+ G(2u, 6u, 10u, 14u, m[12u], m[1u], 3u, 7u, 11u, 15u, m[3u], m[9u]);
+ G(0u, 5u, 10u, 15u, m[5u], m[0u], 1u, 6u, 11u, 12u, m[15u], m[4u]);
+ G(2u, 7u, 8u, 13u, m[8u], m[6u], 3u, 4u, 9u, 14u, m[2u], m[10u]);
+
+ // Round 8
+ G(0u, 4u, 8u, 12u, m[6u], m[15u], 1u, 5u, 9u, 13u, m[14u], m[9u]);
+ G(2u, 6u, 10u, 14u, m[11u], m[3u], 3u, 7u, 11u, 15u, m[0u], m[8u]);
+ G(0u, 5u, 10u, 15u, m[12u], m[2u], 1u, 6u, 11u, 12u, m[13u], m[7u]);
+ G(2u, 7u, 8u, 13u, m[1u], m[4u], 3u, 4u, 9u, 14u, m[10u], m[5u]);
+
+ // Round 9
+ G(0u, 4u, 8u, 12u, m[10u], m[2u], 1u, 5u, 9u, 13u, m[8u], m[4u]);
+ G(2u, 6u, 10u, 14u, m[7u], m[6u], 3u, 7u, 11u, 15u, m[1u], m[5u]);
+ G(0u, 5u, 10u, 15u, m[15u], m[11u], 1u, 6u, 11u, 12u, m[9u], m[14u]);
+ G(2u, 7u, 8u, 13u, m[3u], m[12u], 3u, 4u, 9u, 14u, m[13u], m[0u]);
+
+ // Round 10
+ G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]);
+ G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]);
+ G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]);
+ G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]);
+
+ // Round 11
+ G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]);
+ G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]);
+ G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]);
+ G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]);
+
+ // Emit the nonce pixel when the digest meets the threshold
// Finalize digest from high bits, low bits can be safely ignored
- if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) {
- nonce = uvec4(1u, m[0].y, m[0].x, 1u);
- } else {
+ if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) >= threshold && (search || uvec2(gl_FragCoord) == uvec2(0u))) {
+ nonce = uvec4(1u, m[0u].y, m[0u].x, (uint(gl_FragCoord.x) << 16u) | uint(gl_FragCoord.y));
+ }
+
+ // Valid nonce not found
+ if (nonce.x == 0u) {
discard;
}
}
// src/shaders/gl-vertex.ts
var NanoPowGlVertexShader = `#version 300 es
#pragma vscode_glsllint_stage: vert
+#ifdef GL_FRAGMENT_PRECISION_HIGH
precision highp float;
-layout (location=0) in vec4 position;
-layout (location=1) in vec2 uv;
+#else
+precision mediump float;
+#endif
-out vec2 uv_pos;
+layout (location=0) in vec4 position;
void main() {
- uv_pos = uv;
gl_Position = position;
}
`;
// src/classes/gl.ts
var NanoPowGl = class _NanoPowGl {
static #busy = false;
- /** Used to set canvas size. Must be a multiple of 256. */
- static #WORKLOAD = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency));
+ static #debug = false;
+ static #raf = 0;
+ /** Used to set canvas size. */
+ static #cores = Math.max(1, Math.floor(navigator.hardwareConcurrency));
+ static #WORKLOAD = 256 * this.#cores;
+ static #canvas = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD);
+ static get size() {
+ return this.#gl?.drawingBufferWidth;
+ }
static #gl;
- static #program;
+ static #drawProgram;
+ static #downsampleProgram;
static #vertexShader;
- static #fragmentShader;
- static #texture;
- static #framebuffer;
+ static #drawShader;
+ static #downsampleShader;
static #positionBuffer;
- static #uvBuffer;
+ static #drawFbo;
+ static #downsampleFbos = [];
+ static #downsampleSrcLocation;
static #uboBuffer;
- static #workBuffer;
+ static #uboView = new DataView(new ArrayBuffer(144));
+ static #seedBuffer;
+ static #seed = new BigUint64Array(1);
static #query;
static #pixels;
- /**Vertex Positions, 2 triangles */
+ /** Vertex positions: full-screen quad as 4 corner (x,y) pairs */
static #positions = new Float32Array([
-1,
-1,
- 0,
- -1,
- 1,
- 0,
- 1,
- 1,
- 0,
1,
-1,
- 0,
1,
1,
- 0,
-1,
- -1,
- 0
- ]);
- /** Texture Positions */
- static #uvPosArray = new Float32Array([
- 1,
- 1,
- 1,
- 0,
- 0,
- 0,
- 0,
- 1,
- 0,
- 0,
- 1,
1
]);
/** Compile */
if (this.#busy) return;
this.#busy = true;
try {
- this.#gl = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD).getContext("webgl2");
+ this.#canvas.addEventListener("webglcontextlost", (event) => {
+ event.preventDefault();
+ console.warn("WebGL context lost. Waiting for it to be restored...");
+ cancelAnimationFrame(this.#raf);
+ }, false);
+ this.#canvas.addEventListener("webglcontextrestored", (event) => {
+ console.warn("WebGL context restored. Reinitializing...");
+ _NanoPowGl.init();
+ }, false);
+ this.#gl = this.#canvas.getContext("webgl2");
if (this.#gl == null) throw new Error("WebGL 2 is required");
- this.#gl.clearColor(0, 0, 0, 1);
- this.#program = this.#gl.createProgram();
- if (this.#program == null) throw new Error("Failed to create shader program");
+ this.#drawProgram = this.#gl.createProgram();
+ if (this.#drawProgram == null) throw new Error("Failed to create shader program");
this.#vertexShader = this.#gl.createShader(this.#gl.VERTEX_SHADER);
if (this.#vertexShader == null) throw new Error("Failed to create vertex shader");
this.#gl.shaderSource(this.#vertexShader, NanoPowGlVertexShader);
this.#gl.compileShader(this.#vertexShader);
if (!this.#gl.getShaderParameter(this.#vertexShader, this.#gl.COMPILE_STATUS))
throw new Error(this.#gl.getShaderInfoLog(this.#vertexShader) ?? `Failed to compile vertex shader`);
- this.#fragmentShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER);
- if (this.#fragmentShader == null) throw new Error("Failed to create fragment shader");
- this.#gl.shaderSource(this.#fragmentShader, NanoPowGlFragmentShader);
- this.#gl.compileShader(this.#fragmentShader);
- if (!this.#gl.getShaderParameter(this.#fragmentShader, this.#gl.COMPILE_STATUS))
- throw new Error(this.#gl.getShaderInfoLog(this.#fragmentShader) ?? `Failed to compile fragment shader`);
- this.#gl.attachShader(this.#program, this.#vertexShader);
- this.#gl.attachShader(this.#program, this.#fragmentShader);
- this.#gl.linkProgram(this.#program);
- if (!this.#gl.getProgramParameter(this.#program, this.#gl.LINK_STATUS))
- throw new Error(this.#gl.getProgramInfoLog(this.#program) ?? `Failed to link program`);
- this.#gl.useProgram(this.#program);
+ this.#drawShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER);
+ if (this.#drawShader == null) throw new Error("Failed to create fragment shader");
+ this.#gl.shaderSource(this.#drawShader, NanoPowGlDrawShader);
+ this.#gl.compileShader(this.#drawShader);
+ if (!this.#gl.getShaderParameter(this.#drawShader, this.#gl.COMPILE_STATUS))
+ throw new Error(this.#gl.getShaderInfoLog(this.#drawShader) ?? `Failed to compile fragment shader`);
+ this.#gl.attachShader(this.#drawProgram, this.#vertexShader);
+ this.#gl.attachShader(this.#drawProgram, this.#drawShader);
+ this.#gl.linkProgram(this.#drawProgram);
+ if (!this.#gl.getProgramParameter(this.#drawProgram, this.#gl.LINK_STATUS))
+ throw new Error(this.#gl.getProgramInfoLog(this.#drawProgram) ?? `Failed to link program`);
+ this.#downsampleProgram = this.#gl.createProgram();
+ if (this.#downsampleProgram == null) throw new Error("Failed to create downsample program");
+ this.#downsampleShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER);
+ if (this.#downsampleShader == null) throw new Error("Failed to create downsample shader");
+ this.#gl.shaderSource(this.#downsampleShader, NanoPowGlDownsampleShader);
+ this.#gl.compileShader(this.#downsampleShader);
+ if (!this.#gl.getShaderParameter(this.#downsampleShader, this.#gl.COMPILE_STATUS))
+ throw new Error(this.#gl.getShaderInfoLog(this.#downsampleShader) ?? `Failed to compile downsample shader`);
+ this.#gl.attachShader(this.#downsampleProgram, this.#vertexShader);
+ this.#gl.attachShader(this.#downsampleProgram, this.#downsampleShader);
+ this.#gl.linkProgram(this.#downsampleProgram);
+ if (!this.#gl.getProgramParameter(this.#downsampleProgram, this.#gl.LINK_STATUS))
+ throw new Error(this.#gl.getProgramInfoLog(this.#downsampleProgram) ?? `Failed to link program`);
+ this.#gl.useProgram(this.#drawProgram);
const triangleArray = this.#gl.createVertexArray();
this.#gl.bindVertexArray(triangleArray);
- this.#texture = this.#gl.createTexture();
- this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture);
- this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null);
- this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST);
- this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST);
- this.#gl.bindTexture(this.#gl.TEXTURE_2D, null);
- this.#framebuffer = this.#gl.createFramebuffer();
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer);
- this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0);
- if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
- throw new Error(`Failed to create framebuffer`);
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
this.#positionBuffer = this.#gl.createBuffer();
this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer);
this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW);
- this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0);
+ this.#gl.vertexAttribPointer(0, 2, this.#gl.FLOAT, false, 0, 0);
this.#gl.enableVertexAttribArray(0);
this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null);
- this.#uvBuffer = this.#gl.createBuffer();
- this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer);
- this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, this.#gl.STATIC_DRAW);
- this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0);
- this.#gl.enableVertexAttribArray(1);
- this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null);
+ const texture = this.#gl.createTexture();
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture);
+ this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null);
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST);
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST);
+ const framebuffer = this.#gl.createFramebuffer();
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer);
+ this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0);
+ if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
+ throw new Error(`Failed to create drawing framebuffer`);
+ this.#drawFbo = { texture, framebuffer, size: { x: this.#gl.drawingBufferWidth, y: this.#gl.drawingBufferHeight } };
+ for (let i = 1; i <= 4; i++) {
+ const width = this.#gl.drawingBufferWidth / 2 ** i;
+ const height = this.#gl.drawingBufferHeight / 2 ** i;
+ const texture2 = this.#gl.createTexture();
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture2);
+ this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, width, height, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null);
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST);
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST);
+ const framebuffer2 = this.#gl.createFramebuffer();
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer2);
+ this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture2, 0);
+ if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
+ throw new Error(`Failed to create downsampling framebuffer ${i}`);
+ this.#downsampleFbos.push({ texture: texture2, framebuffer: framebuffer2, size: { x: width, y: height } });
+ }
+ this.#downsampleSrcLocation = this.#gl.getUniformLocation(this.#downsampleProgram, "src");
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, null);
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
this.#uboBuffer = this.#gl.createBuffer();
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer);
this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 144, this.#gl.DYNAMIC_DRAW);
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer);
- this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "UBO"), 0);
+ this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, "UBO"), 0);
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
- this.#workBuffer = this.#gl.createBuffer();
- this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer);
- this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 32, this.#gl.STREAM_DRAW);
+ this.#seedBuffer = this.#gl.createBuffer();
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer);
+ this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 16, this.#gl.DYNAMIC_DRAW);
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
- this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer);
- this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "WORK"), 1);
+ this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#seedBuffer);
+ this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, "WORK"), 1);
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
- this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4);
this.#query = this.#gl.createQuery();
+ this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4);
+ console.log(`NanoPow WebGL initialized at ${this.#gl.drawingBufferWidth}x${this.#gl.drawingBufferHeight}. Maximum nonces checked per frame: ${this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight}`);
} catch (err) {
- throw new Error(`WebGL initialization failed. ${err}`);
+ throw new Error("WebGL initialization failed.", { cause: err });
} finally {
this.#busy = false;
}
}
static reset() {
+ cancelAnimationFrame(_NanoPowGl.#raf);
+ _NanoPowGl.#gl?.deleteQuery(_NanoPowGl.#query);
_NanoPowGl.#query = null;
- _NanoPowGl.#workBuffer = null;
+ _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#seedBuffer);
+ _NanoPowGl.#seedBuffer = null;
+ _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#uboBuffer);
_NanoPowGl.#uboBuffer = null;
- _NanoPowGl.#uvBuffer = null;
+ for (const fbo of _NanoPowGl.#downsampleFbos) {
+ _NanoPowGl.#gl?.deleteFramebuffer(fbo.framebuffer);
+ _NanoPowGl.#gl?.deleteTexture(fbo.texture);
+ }
+ _NanoPowGl.#downsampleFbos = [];
+ _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#downsampleShader);
+ _NanoPowGl.#downsampleShader = null;
+ _NanoPowGl.#gl?.deleteProgram(_NanoPowGl.#downsampleProgram);
+ _NanoPowGl.#downsampleProgram = null;
+ _NanoPowGl.#gl?.deleteFramebuffer(_NanoPowGl.#drawFbo?.framebuffer ?? null);
+ _NanoPowGl.#gl?.deleteTexture(_NanoPowGl.#drawFbo?.texture ?? null);
+ _NanoPowGl.#drawFbo = null;
+ _NanoPowGl.#gl?.deleteBuffer(_NanoPowGl.#positionBuffer);
_NanoPowGl.#positionBuffer = null;
- _NanoPowGl.#framebuffer = null;
- _NanoPowGl.#texture = null;
- _NanoPowGl.#fragmentShader = null;
+ _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#drawShader);
+ _NanoPowGl.#drawShader = null;
+ _NanoPowGl.#gl?.deleteShader(_NanoPowGl.#vertexShader);
_NanoPowGl.#vertexShader = null;
- _NanoPowGl.#program = null;
+ _NanoPowGl.#gl?.deleteProgram(_NanoPowGl.#drawProgram);
+ _NanoPowGl.#drawProgram = null;
_NanoPowGl.#gl = null;
_NanoPowGl.#busy = false;
_NanoPowGl.init();
"Harmonic Mean (ms)": count / reciprocals,
"Geometric Mean (ms)": Math.exp(logarithms / count)
};
+ console.log(`Averages: ${JSON.stringify(averages)}`);
console.table(averages);
}
- static #draw(work) {
+ static #draw(seed) {
if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to draw and query pixels");
- if (this.#workBuffer == null) throw new Error("Work buffer is required to draw");
- this.#gl.clear(this.#gl.COLOR_BUFFER_BIT);
- this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer);
- this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work);
+ if (this.#drawFbo == null) throw new Error("FBO is required to draw");
+ if (this.#seed[0] == null || this.#seedBuffer == null) throw new Error("Seed is required to draw");
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer);
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, seed);
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
+ this.#gl.useProgram(this.#drawProgram);
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer);
+ this.#gl.activeTexture(this.#gl.TEXTURE0);
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#drawFbo.texture);
this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query);
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer);
- this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6);
+ this.#gl.viewport(0, 0, this.#drawFbo.size.x, this.#drawFbo.size.y);
+ this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4);
this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE);
}
static async #checkQueryResult() {
return new Promise((resolve, reject) => {
- try {
- if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to check query results");
- if (this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT_AVAILABLE)) {
- resolve(!!this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT));
- } else {
- requestAnimationFrame(async () => {
- const result = await _NanoPowGl.#checkQueryResult();
- resolve(result);
- });
+ function check() {
+ try {
+ if (_NanoPowGl.#gl == null || _NanoPowGl.#query == null) throw new Error("WebGL 2 is required to check query results");
+ if (_NanoPowGl.#gl.getQueryParameter(_NanoPowGl.#query, _NanoPowGl.#gl.QUERY_RESULT_AVAILABLE)) {
+ resolve(!!_NanoPowGl.#gl.getQueryParameter(_NanoPowGl.#query, _NanoPowGl.#gl.QUERY_RESULT));
+ } else {
+ _NanoPowGl.#raf = requestAnimationFrame(check);
+ }
+ } catch (err) {
+ reject(err);
}
- } catch (err) {
- reject(err);
}
+ check();
});
}
/**
*/
static #readResult(workHex) {
if (this.#gl == null) throw new Error("WebGL 2 is required to read pixels");
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer);
- this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels);
+ if (this.#drawFbo == null) throw new Error("Source FBO is required to downsample");
+ let source = this.#drawFbo;
+ let pixelCount;
+ const start = performance.now();
+ if (workHex != null) {
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer);
+ this.#gl.readPixels(0, 0, 1, 1, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels);
+ pixelCount = 4;
+ } else {
+ this.#gl.useProgram(this.#downsampleProgram);
+ for (const fbo of this.#downsampleFbos) {
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, fbo.framebuffer);
+ this.#gl.activeTexture(this.#gl.TEXTURE0);
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, source.texture);
+ this.#gl.uniform1i(this.#downsampleSrcLocation, 0);
+ this.#gl.viewport(0, 0, fbo.size.x, fbo.size.y);
+ this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4);
+ source = fbo;
+ }
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer);
+ this.#gl.readPixels(0, 0, source.size.x, source.size.y, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels);
+ pixelCount = source.size.x * source.size.y * 4;
+ }
this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
- for (let i = 0; i < this.#pixels.length; i += 4) {
+ for (let i = 0; i < pixelCount; i += 4) {
if (this.#pixels[i] !== 0) {
+ if (this.#debug) console.log(`readResults (${performance.now() - start} ms)`);
+ if (this.#debug) console.log(`Pixel: rgba(${this.#pixels[i]}, ${this.#pixels[i + 1]}, ${this.#pixels[i + 2]}, ${this.#pixels[i + 3].toString(16).padStart(8, "0")})`);
const hex = `${this.#pixels[i + 1].toString(16).padStart(8, "0")}${this.#pixels[i + 2].toString(16).padStart(8, "0")}`;
if (workHex == null || workHex == hex) return hex;
}
* @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation
*/
static async search(hash, options) {
- if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required");
- if (this.#gl == null) throw new Error("WebGL 2 is required");
- if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`);
if (this.#busy) {
+ console.log("NanoPowGl is busy. Retrying search...");
return new Promise((resolve) => {
setTimeout(async () => {
const result = this.search(hash, options);
});
}
this.#busy = true;
+ if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`);
const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold;
- const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 8 : options.effort;
- const debug = !!options?.debug;
+ const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? this.#cores : options.effort;
+ this.#debug = !!options?.debug;
if (this.#WORKLOAD !== 256 * effort) {
this.#WORKLOAD = 256 * effort;
+ this.#canvas.height = this.#WORKLOAD;
+ this.#canvas.width = this.#WORKLOAD;
this.reset();
}
- const uboView = new DataView(new ArrayBuffer(144));
+ if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required");
+ if (this.#gl == null) throw new Error("WebGL 2 is required");
+ if (this.#drawFbo == null) throw new Error("WebGL framebuffer is required");
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer);
+ this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]);
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0);
for (let i = 0; i < 64; i += 8) {
const uint32 = hash.slice(i, i + 8);
- uboView.setUint32(i * 2, parseInt(uint32, 16));
+ this.#uboView.setUint32(i * 2, parseInt(uint32, 16));
}
- uboView.setUint32(128, threshold, true);
- uboView.setFloat32(132, 256 * effort, true);
- _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer);
- _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView);
- _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null);
+ this.#uboView.setUint32(128, threshold, true);
+ this.#uboView.setUint32(132, 1, true);
+ if (this.#debug) console.log("UBO", this.#uboView.buffer.slice(0));
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer);
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView);
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
let times = [];
let start = performance.now();
let nonce = null;
- const seed = new Uint8Array(8);
+ if (this.#debug) console.groupCollapsed("Seeds (click to view)");
while (nonce == null) {
start = performance.now();
- crypto.getRandomValues(seed);
- this.#draw(seed);
+ const random0 = Math.floor(Math.random() * 4294967295);
+ const random1 = Math.floor(Math.random() * 4294967295);
+ this.#seed[0] = BigInt(random0) << 32n | BigInt(random1);
+ if (this.#debug) console.log("Seed", this.#seed);
+ this.#draw(this.#seed);
const found = await this.#checkQueryResult();
times.push(performance.now() - start);
if (found) {
+ if (this.#debug) console.groupEnd();
nonce = this.#readResult();
}
}
this.#busy = false;
- if (debug) this.#logAverages(times);
+ if (this.#debug) this.#logAverages(times);
return nonce;
}
/**
* @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation
*/
static async validate(work, hash, options) {
- if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required");
- if (this.#gl == null) throw new Error("WebGL 2 is required");
- if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`);
- if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`);
if (this.#busy) {
+ console.log("NanoPowGl is busy. Retrying validate...");
return new Promise((resolve) => {
setTimeout(async () => {
const result = this.validate(work, hash, options);
});
}
this.#busy = true;
+ if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`);
+ if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`);
const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold;
- const debug = !!options?.debug;
- if (this.#WORKLOAD !== 1) {
- this.#WORKLOAD = 1;
- this.reset();
- }
- const uboView = new DataView(new ArrayBuffer(144));
+ this.#debug = !!options?.debug;
+ if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required");
+ if (this.#gl == null) throw new Error("WebGL 2 is required");
+ if (this.#drawFbo == null) throw new Error("WebGL framebuffer is required");
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer);
+ this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0]);
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null);
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0);
for (let i = 0; i < 64; i += 8) {
const uint32 = hash.slice(i, i + 8);
- uboView.setUint32(i * 2, parseInt(uint32, 16));
+ this.#uboView.setUint32(i * 2, parseInt(uint32, 16));
}
- uboView.setUint32(128, threshold, true);
- uboView.setFloat32(132, _NanoPowGl.#WORKLOAD - 1, true);
- _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer);
- _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView);
- _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null);
+ this.#uboView.setUint32(128, threshold, true);
+ this.#uboView.setUint32(132, 0, true);
+ if (this.#debug) console.log("UBO", this.#uboView.buffer.slice(0));
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer);
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView);
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null);
let nonce = null;
- const data = new DataView(new ArrayBuffer(8));
- data.setBigUint64(0, BigInt(`0x${work}`), true);
- const seed = new Uint8Array(data.buffer);
- this.#draw(seed);
+ this.#seed[0] = BigInt(`0x${work}`);
+ if (this.#debug) console.log("Work", this.#seed);
+ this.#draw(this.#seed);
let found = await this.#checkQueryResult();
if (found) {
try {
var NanoPowGpu = class _NanoPowGpu {
// Initialize WebGPU
static #busy = false;
+ static #debug = false;
static #device = null;
- static #uboBuffer;
+ static #gpuBufferReset = new BigUint64Array([0n, 0n]);
static #gpuBuffer;
static #cpuBuffer;
+ static #uboBuffer;
+ static #uboView;
static #bindGroupLayout;
static #searchPipeline;
static #validatePipeline;
this.#device = device;
this.setup();
} catch (err) {
- throw new Error(`WebGPU initialization failed. ${err}`);
+ throw new Error("WebGPU initialization failed.", { cause: err });
} finally {
this.#busy = false;
}
}
static setup() {
if (this.#device == null) throw new Error(`WebGPU device failed to load.`);
- this.#uboBuffer = this.#device.createBuffer({
- size: 48,
- usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
- });
this.#gpuBuffer = this.#device.createBuffer({
size: 16,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
size: 16,
usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
});
+ this.#uboBuffer = this.#device.createBuffer({
+ size: 48,
+ usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
+ });
+ this.#uboView = new DataView(new ArrayBuffer(48));
this.#bindGroupLayout = this.#device.createBindGroupLayout({
entries: [
{
module: shaderModule
}
});
+ console.log(`NanoPow WebGPU initialized. Recommended effort: ${Math.max(1, Math.floor(navigator.hardwareConcurrency / 2))}`);
}
static reset() {
console.warn(`GPU device lost. Reinitializing...`);
}
static async #dispatch(pipeline, seed, hash, threshold, passes) {
if (this.#device == null) throw new Error(`WebGPU device failed to load.`);
- const uboView = new DataView(new ArrayBuffer(48));
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0);
for (let i = 0; i < 64; i += 16) {
const u64 = hash.slice(i, i + 16);
- uboView.setBigUint64(i / 2, BigInt(`0x${u64}`));
+ this.#uboView.setBigUint64(i / 2, BigInt(`0x${u64}`));
}
- uboView.setBigUint64(32, seed, true);
- uboView.setUint32(40, threshold, true);
- this.#device.queue.writeBuffer(this.#uboBuffer, 0, uboView);
- this.#device.queue.writeBuffer(this.#gpuBuffer, 0, new Uint32Array([0, 0, 0]));
+ this.#uboView.setBigUint64(32, seed, true);
+ this.#uboView.setUint32(40, threshold, true);
+ if (this.#debug) console.log("UBO", this.#uboView);
+ this.#device.queue.writeBuffer(this.#uboBuffer, 0, this.#uboView);
+ this.#device.queue.writeBuffer(this.#gpuBuffer, 0, this.#gpuBufferReset);
const bindGroup = this.#device.createBindGroup({
layout: this.#bindGroupLayout,
entries: [
console.warn(`Error getting data from GPU. ${err}`);
return this.#dispatch(pipeline, seed, hash, threshold, passes);
}
+ if (this.#debug) console.log("gpuBuffer data", data);
if (data == null) throw new Error(`Failed to get data from buffer.`);
return data;
}
static async search(hash, options) {
if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`);
if (this.#busy) {
+ console.log("NanoPowGpu is busy. Retrying search...");
return new Promise((resolve) => {
setTimeout(async () => {
const result = this.search(hash, options);
this.#busy = true;
const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold;
const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 2048 : options.effort * 256;
- const debug = !!options?.debug;
+ this.#debug = !!options?.debug;
let loads = 0;
while (this.#device == null && loads < 20) {
await new Promise((resolve) => {
let nonce = 0n;
do {
start = performance.now();
- const random = Math.floor(Math.random() * 4294967295);
- const seed = BigInt(random) << 32n | BigInt(random);
+ const random0 = Math.floor(Math.random() * 4294967295);
+ const random1 = Math.floor(Math.random() * 4294967295);
+ const seed = BigInt(random0) << 32n | BigInt(random1);
+ if (this.#debug) console.log(`seed: ${seed}`);
const data = await this.#dispatch(this.#searchPipeline, seed, hash, threshold, effort);
nonce = data.getBigUint64(0, true);
this.#busy = !data.getUint32(8);
times.push(performance.now() - start);
} while (this.#busy);
- if (debug) this.#logAverages(times);
+ if (this.#debug) this.#logAverages(times);
return nonce.toString(16).padStart(16, "0");
}
/**
if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new TypeError(`Invalid work ${work}`);
if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`);
if (this.#busy) {
+ console.log("NanoPowGpu is busy. Retrying validate...");
return new Promise((resolve) => {
setTimeout(async () => {
const result = this.validate(work, hash, options);
});
}
this.#busy = true;
- const debug = !!options?.debug;
+ this.#debug = !!options?.debug;
const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold;
let loads = 0;
while (this.#device == null && loads < 20) {
throw new Error(`WebGPU device failed to load.`);
}
const seed = BigInt(`0x${work}`);
+ if (this.#debug) console.log(`work: ${work}`);
const data = await this.#dispatch(this.#validatePipeline, seed, hash, threshold, 1);
const nonce = data.getBigUint64(0, true).toString(16).padStart(16, "0");
+ if (this.#debug) console.log(`nonce: ${nonce}`);
const found = !!data.getUint32(8);
this.#busy = false;
if (found && work !== nonce) throw new Error(`Nonce (${nonce}) found but does not match work (${work})`);
await NanoPowGpu.init();
isGpuSupported = true;
} catch (err) {
- console.error(err);
- console.warn(`WebGPU is not supported in this environment.`);
+ console.warn("WebGPU is not supported in this environment.\n", err);
isGpuSupported = false;
}
try {
await NanoPowGl.init();
isGlSupported = true;
} catch (err) {
- console.error(err);
- console.warn(`WebGL is not supported in this environment.`);
+ console.warn("WebGL is not supported in this environment.\n", err);
isGlSupported = false;
}
var NanoPow = isGpuSupported ? NanoPowGpu : isGlSupported ? NanoPowGl : null;
// SPDX-FileContributor: Ben Green <ben@latenightsketches.com>
// SPDX-License-Identifier: GPL-3.0-or-later AND MIT
-import { NanoPowGlFragmentShader, NanoPowGlVertexShader } from '../shaders'
-import type { NanoPowOptions } from '../../types.d.ts'
+import { NanoPowGlDownsampleShader, NanoPowGlDrawShader, NanoPowGlVertexShader } from '../shaders'
+import type { FBO, NanoPowOptions } from '../../types.d.ts'
export class NanoPowGl {
static #busy: boolean = false
- /** Used to set canvas size. Must be a multiple of 256. */
- static #WORKLOAD: number = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency))
+ static #debug: boolean = false
+ static #raf: number = 0
+ /** Used to set canvas size. */
+ static #cores: number = Math.max(1, Math.floor(navigator.hardwareConcurrency))
+ static #WORKLOAD: number = 256 * this.#cores
+ static #canvas: OffscreenCanvas = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD)
+ static get size () { return this.#gl?.drawingBufferWidth }
static #gl: WebGL2RenderingContext | null
- static #program: WebGLProgram | null
+ static #drawProgram: WebGLProgram | null
+ static #downsampleProgram: WebGLProgram | null
static #vertexShader: WebGLShader | null
- static #fragmentShader: WebGLShader | null
- static #texture: WebGLTexture | null
- static #framebuffer: WebGLFramebuffer | null
+ static #drawShader: WebGLShader | null
+ static #downsampleShader: WebGLShader | null
static #positionBuffer: WebGLBuffer | null
- static #uvBuffer: WebGLBuffer | null
+ static #drawFbo: FBO | null
+ static #downsampleFbos: FBO[] = []
+ static #downsampleSrcLocation: WebGLUniformLocation | null
static #uboBuffer: WebGLBuffer | null
- static #workBuffer: WebGLBuffer | null
+ static #uboView: DataView = new DataView(new ArrayBuffer(144))
+ static #seedBuffer: WebGLBuffer | null
+ static #seed: BigUint64Array = new BigUint64Array(1)
static #query: WebGLQuery | null
static #pixels: Uint32Array
+
/**Vertex Positions, 2 triangles */
- static #positions = new Float32Array([
- -1, -1, 0, -1, 1, 0, 1, 1, 0,
- 1, -1, 0, 1, 1, 0, -1, -1, 0
- ])
- /** Texture Positions */
- static #uvPosArray = new Float32Array([
- 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1
+ static #positions: Float32Array = new Float32Array([
+ -1, -1, 1, -1, 1, 1, -1, 1
])
/** Compile */
this.#busy = true
try {
- this.#gl = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD).getContext('webgl2')
+ this.#canvas.addEventListener('webglcontextlost', event => {
+ event.preventDefault()
+ console.warn('WebGL context lost. Waiting for it to be restored...')
+ cancelAnimationFrame(this.#raf)
+ }, false)
+ this.#canvas.addEventListener('webglcontextrestored', event => {
+ console.warn('WebGL context restored. Reinitializing...')
+ NanoPowGl.init()
+ }, false)
+ this.#gl = this.#canvas.getContext('webgl2')
if (this.#gl == null) throw new Error('WebGL 2 is required')
- this.#gl.clearColor(0, 0, 0, 1)
- this.#program = this.#gl.createProgram()
- if (this.#program == null) throw new Error('Failed to create shader program')
+ /** Create drawing program */
+ this.#drawProgram = this.#gl.createProgram()
+ if (this.#drawProgram == null) throw new Error('Failed to create shader program')
this.#vertexShader = this.#gl.createShader(this.#gl.VERTEX_SHADER)
if (this.#vertexShader == null) throw new Error('Failed to create vertex shader')
if (!this.#gl.getShaderParameter(this.#vertexShader, this.#gl.COMPILE_STATUS))
throw new Error(this.#gl.getShaderInfoLog(this.#vertexShader) ?? `Failed to compile vertex shader`)
- this.#fragmentShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER)
- if (this.#fragmentShader == null) throw new Error('Failed to create fragment shader')
- this.#gl.shaderSource(this.#fragmentShader, NanoPowGlFragmentShader)
- this.#gl.compileShader(this.#fragmentShader)
- if (!this.#gl.getShaderParameter(this.#fragmentShader, this.#gl.COMPILE_STATUS))
- throw new Error(this.#gl.getShaderInfoLog(this.#fragmentShader) ?? `Failed to compile fragment shader`)
-
- this.#gl.attachShader(this.#program, this.#vertexShader)
- this.#gl.attachShader(this.#program, this.#fragmentShader)
- this.#gl.linkProgram(this.#program)
- if (!this.#gl.getProgramParameter(this.#program, this.#gl.LINK_STATUS))
- throw new Error(this.#gl.getProgramInfoLog(this.#program) ?? `Failed to link program`)
-
- /** Construct simple 2D geometry */
- this.#gl.useProgram(this.#program)
+ this.#drawShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER)
+ if (this.#drawShader == null) throw new Error('Failed to create fragment shader')
+ this.#gl.shaderSource(this.#drawShader, NanoPowGlDrawShader)
+ this.#gl.compileShader(this.#drawShader)
+ if (!this.#gl.getShaderParameter(this.#drawShader, this.#gl.COMPILE_STATUS))
+ throw new Error(this.#gl.getShaderInfoLog(this.#drawShader) ?? `Failed to compile fragment shader`)
+
+ this.#gl.attachShader(this.#drawProgram, this.#vertexShader)
+ this.#gl.attachShader(this.#drawProgram, this.#drawShader)
+ this.#gl.linkProgram(this.#drawProgram)
+ if (!this.#gl.getProgramParameter(this.#drawProgram, this.#gl.LINK_STATUS))
+ throw new Error(this.#gl.getProgramInfoLog(this.#drawProgram) ?? `Failed to link program`)
+
+ /** Create downsampling program */
+ this.#downsampleProgram = this.#gl.createProgram()
+ if (this.#downsampleProgram == null) throw new Error('Failed to create downsample program')
+
+ this.#downsampleShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER)
+ if (this.#downsampleShader == null) throw new Error('Failed to create downsample shader')
+ this.#gl.shaderSource(this.#downsampleShader, NanoPowGlDownsampleShader)
+ this.#gl.compileShader(this.#downsampleShader)
+ if (!this.#gl.getShaderParameter(this.#downsampleShader, this.#gl.COMPILE_STATUS))
+ throw new Error(this.#gl.getShaderInfoLog(this.#downsampleShader) ?? `Failed to compile downsample shader`)
+
+ this.#gl.attachShader(this.#downsampleProgram, this.#vertexShader)
+ this.#gl.attachShader(this.#downsampleProgram, this.#downsampleShader)
+ this.#gl.linkProgram(this.#downsampleProgram)
+ if (!this.#gl.getProgramParameter(this.#downsampleProgram, this.#gl.LINK_STATUS))
+ throw new Error(this.#gl.getProgramInfoLog(this.#downsampleProgram) ?? `Failed to link program`)
+
+ /** Construct fullscreen quad for rendering */
+ this.#gl.useProgram(this.#drawProgram)
const triangleArray = this.#gl.createVertexArray()
this.#gl.bindVertexArray(triangleArray)
- this.#texture = this.#gl.createTexture()
- this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture)
- this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null)
- this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST)
- this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST)
- this.#gl.bindTexture(this.#gl.TEXTURE_2D, null)
-
- this.#framebuffer = this.#gl.createFramebuffer()
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
- this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0)
- if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
- throw new Error(`Failed to create framebuffer`)
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
-
this.#positionBuffer = this.#gl.createBuffer()
this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer)
this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW)
- this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0)
+ this.#gl.vertexAttribPointer(0, 2, this.#gl.FLOAT, false, 0, 0)
this.#gl.enableVertexAttribArray(0)
this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null)
- this.#uvBuffer = this.#gl.createBuffer()
- this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer)
- this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, this.#gl.STATIC_DRAW)
- this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0)
- this.#gl.enableVertexAttribArray(1)
- this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null)
+ /** Create texture and framebuffer for drawing */
+ const texture = this.#gl.createTexture()
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture)
+ this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null)
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST)
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST)
+ const framebuffer = this.#gl.createFramebuffer()
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer)
+ this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0)
+ if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
+ throw new Error(`Failed to create drawing framebuffer`)
+ this.#drawFbo = { texture, framebuffer, size: { x: this.#gl.drawingBufferWidth, y: this.#gl.drawingBufferHeight } }
+
+ /** Create textures, framebuffers, and uniform location for downsampling */
+ for (let i = 1; i <= 4; i++) {
+ const width = this.#gl.drawingBufferWidth / (2 ** i)
+ const height = this.#gl.drawingBufferHeight / (2 ** i)
+
+ const texture = this.#gl.createTexture()
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, texture)
+ this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, width, height, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null)
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST)
+ this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST)
+
+ const framebuffer = this.#gl.createFramebuffer()
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, framebuffer)
+ this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, texture, 0)
+ if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
+ throw new Error(`Failed to create downsampling framebuffer ${i}`)
+ this.#downsampleFbos.push({ texture, framebuffer, size: { x: width, y: height } })
+ }
+ this.#downsampleSrcLocation = this.#gl.getUniformLocation(this.#downsampleProgram, 'src')
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, null)
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
+
+ /** Create input buffers */
this.#uboBuffer = this.#gl.createBuffer()
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer)
this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 144, this.#gl.DYNAMIC_DRAW)
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer)
- this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'UBO'), 0)
+ this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, 'UBO'), 0)
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
- this.#workBuffer = this.#gl.createBuffer()
- this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer)
- this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 32, this.#gl.STREAM_DRAW)
+ this.#seedBuffer = this.#gl.createBuffer()
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer)
+ this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 16, this.#gl.DYNAMIC_DRAW)
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
- this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer)
- this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'WORK'), 1)
+ this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#seedBuffer)
+ this.#gl.uniformBlockBinding(this.#drawProgram, this.#gl.getUniformBlockIndex(this.#drawProgram, 'WORK'), 1)
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
- this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4)
+ /** Finalize configuration */
this.#query = this.#gl.createQuery()
+ this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4)
+ console.log(`NanoPow WebGL initialized at ${this.#gl.drawingBufferWidth}x${this.#gl.drawingBufferHeight}. Maximum nonces checked per frame: ${this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight}`)
} catch (err) {
- throw new Error(`WebGL initialization failed. ${err}`)
+ throw new Error('WebGL initialization failed.', { cause: err })
} finally {
this.#busy = false
}
}
static reset (): void {
+ cancelAnimationFrame(NanoPowGl.#raf)
+ NanoPowGl.#gl?.deleteQuery(NanoPowGl.#query)
NanoPowGl.#query = null
- NanoPowGl.#workBuffer = null
+ NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#seedBuffer)
+ NanoPowGl.#seedBuffer = null
+ NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#uboBuffer)
NanoPowGl.#uboBuffer = null
- NanoPowGl.#uvBuffer = null
+ for (const fbo of NanoPowGl.#downsampleFbos) {
+ NanoPowGl.#gl?.deleteFramebuffer(fbo.framebuffer)
+ NanoPowGl.#gl?.deleteTexture(fbo.texture)
+ }
+ NanoPowGl.#downsampleFbos = []
+ NanoPowGl.#gl?.deleteShader(NanoPowGl.#downsampleShader)
+ NanoPowGl.#downsampleShader = null
+ NanoPowGl.#gl?.deleteProgram(NanoPowGl.#downsampleProgram)
+ NanoPowGl.#downsampleProgram = null
+ NanoPowGl.#gl?.deleteFramebuffer(NanoPowGl.#drawFbo?.framebuffer ?? null)
+ NanoPowGl.#drawFbo = null
+ NanoPowGl.#gl?.deleteTexture(NanoPowGl.#drawFbo)
+ NanoPowGl.#drawFbo = null
+ NanoPowGl.#gl?.deleteBuffer(NanoPowGl.#positionBuffer)
NanoPowGl.#positionBuffer = null
- NanoPowGl.#framebuffer = null
- NanoPowGl.#texture = null
- NanoPowGl.#fragmentShader = null
+ NanoPowGl.#gl?.deleteShader(NanoPowGl.#drawShader)
+ NanoPowGl.#drawShader = null
+ NanoPowGl.#gl?.deleteShader(NanoPowGl.#vertexShader)
NanoPowGl.#vertexShader = null
- NanoPowGl.#program = null
+ NanoPowGl.#gl?.deleteProgram(NanoPowGl.#drawProgram)
+ NanoPowGl.#drawProgram = null
NanoPowGl.#gl = null
NanoPowGl.#busy = false
NanoPowGl.init()
"Harmonic Mean (ms)": count / reciprocals,
"Geometric Mean (ms)": Math.exp(logarithms / count)
}
+ console.log(`Averages: ${JSON.stringify(averages)}`)
console.table(averages)
}
- static #draw (work: Uint8Array): void {
+ static #draw (seed: BigUint64Array): void {
if (this.#gl == null || this.#query == null) throw new Error('WebGL 2 is required to draw and query pixels')
- if (this.#workBuffer == null) throw new Error('Work buffer is required to draw')
- this.#gl.clear(this.#gl.COLOR_BUFFER_BIT)
+ if (this.#drawFbo == null) throw new Error('FBO is required to draw')
+ if (this.#seed[0] == null || this.#seedBuffer == null) throw new Error('Seed is required to draw')
- /** Upload work buffer */
- this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer)
- this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work)
+ /** Upload work seed buffer */
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#seedBuffer)
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, seed)
this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
+ /** Draw full canvas */
+ this.#gl.useProgram(this.#drawProgram)
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer)
+ this.#gl.activeTexture(this.#gl.TEXTURE0)
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#drawFbo.texture)
+
this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query)
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
- this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6)
+ this.#gl.viewport(0, 0, this.#drawFbo.size.x, this.#drawFbo.size.y)
+ this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4)
this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE)
}
static async #checkQueryResult (): Promise<boolean> {
return new Promise((resolve, reject) => {
- try {
- if (this.#gl == null || this.#query == null) throw new Error('WebGL 2 is required to check query results')
- if (this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT_AVAILABLE)) {
- resolve(!!(this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT)))
- } else {
- /** Query result not yet available, check again in the next frame */
- requestAnimationFrame(async (): Promise<void> => {
- const result = await NanoPowGl.#checkQueryResult()
- resolve(result)
- })
+ function check () {
+ try {
+ if (NanoPowGl.#gl == null || NanoPowGl.#query == null) throw new Error('WebGL 2 is required to check query results')
+ if (NanoPowGl.#gl.getQueryParameter(NanoPowGl.#query, NanoPowGl.#gl.QUERY_RESULT_AVAILABLE)) {
+ resolve(!!(NanoPowGl.#gl.getQueryParameter(NanoPowGl.#query, NanoPowGl.#gl.QUERY_RESULT)))
+ } else {
+ /** Query result not yet available, check again in the next frame */
+ NanoPowGl.#raf = requestAnimationFrame(check)
+ }
+ } catch (err) {
+ reject(err)
}
- } catch (err) {
- reject(err)
}
+ check()
})
}
*/
static #readResult (workHex?: string): string {
if (this.#gl == null) throw new Error('WebGL 2 is required to read pixels')
- this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
- this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels)
+ if (this.#drawFbo == null) throw new Error('Source FBO is required to downsample')
+
+ let source = this.#drawFbo
+ let pixelCount
+ const start = performance.now()
+ if (workHex != null) {
+ /** Read validate results immediately without unnecessary downsampling */
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer)
+ this.#gl.readPixels(0, 0, 1, 1, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels)
+ pixelCount = 4
+ } else {
+ /** Downsample framebuffer */
+ this.#gl.useProgram(this.#downsampleProgram)
+ for (const fbo of this.#downsampleFbos) {
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, fbo.framebuffer)
+ this.#gl.activeTexture(this.#gl.TEXTURE0)
+ this.#gl.bindTexture(this.#gl.TEXTURE_2D, source.texture)
+ this.#gl.uniform1i(this.#downsampleSrcLocation, 0)
+ this.#gl.viewport(0, 0, fbo.size.x, fbo.size.y)
+ this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 4)
+ source = fbo
+ }
+ /** Read downsampled result */
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, source.framebuffer)
+ this.#gl.readPixels(0, 0, source.size.x, source.size.y, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels)
+ pixelCount = source.size.x * source.size.y * 4
+ }
this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
- for (let i = 0; i < this.#pixels.length; i += 4) {
+
+ for (let i = 0; i < pixelCount; i += 4) {
if (this.#pixels[i] !== 0) {
+ if (this.#debug) console.log(`readResults (${performance.now() - start} ms)`)
+ if (this.#debug) console.log(`Pixel: rgba(${this.#pixels[i]}, ${this.#pixels[i + 1]}, ${this.#pixels[i + 2]}, ${this.#pixels[i + 3].toString(16).padStart(8, '0')})`)
/** Return the work value with the custom bits */
- const hex = `${this.#pixels[i+1].toString(16).padStart(8, '0')}${this.#pixels[i+2].toString(16).padStart(8, '0')}`
+ const hex = `${this.#pixels[i + 1].toString(16).padStart(8, '0')}${this.#pixels[i + 2].toString(16).padStart(8, '0')}`
if (workHex == null || workHex == hex) return hex
}
}
* @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation
*/
static async search (hash: string, options?: NanoPowOptions): Promise<string> {
- if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required')
- if (this.#gl == null) throw new Error('WebGL 2 is required')
- if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`)
if (this.#busy) {
+ console.log('NanoPowGl is busy. Retrying search...')
return new Promise(resolve => {
setTimeout(async (): Promise<void> => {
const result = this.search(hash, options)
})
}
this.#busy = true
+
+ /** Process user input */
+ if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`)
const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff)
? 0xfffffff8
: options.threshold
const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x20)
- ? 0x8
+ ? this.#cores
: options.effort
- const debug = !!(options?.debug)
+ this.#debug = !!(options?.debug)
/** Reset if user specified new level of effort */
if (this.#WORKLOAD !== 256 * effort) {
this.#WORKLOAD = 256 * effort
+ this.#canvas.height = this.#WORKLOAD
+ this.#canvas.width = this.#WORKLOAD
this.reset()
}
+ if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required')
+ if (this.#gl == null) throw new Error('WebGL 2 is required')
+ if (this.#drawFbo == null) throw new Error('WebGL framebuffer is required')
+
+ /** Clear any previous results */
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer)
+ this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0])
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
/** Set up uniform buffer object */
- const uboView = new DataView(new ArrayBuffer(144))
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0)
for (let i = 0; i < 64; i += 8) {
const uint32 = hash.slice(i, i + 8)
- uboView.setUint32(i * 2, parseInt(uint32, 16))
+ this.#uboView.setUint32(i * 2, parseInt(uint32, 16))
}
- uboView.setUint32(128, threshold, true)
- uboView.setFloat32(132, 256 * effort, true)
- NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, NanoPowGl.#uboBuffer)
- NanoPowGl.#gl.bufferSubData(NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView)
- NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, null)
+ this.#uboView.setUint32(128, threshold, true)
+ this.#uboView.setUint32(132, 1, true)
+ if (this.#debug) console.log('UBO', this.#uboView.buffer.slice(0))
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer)
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView)
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
/** Start drawing to calculate one nonce per pixel */
let times = []
let start = performance.now()
let nonce = null
- const seed = new Uint8Array(8)
+ if (this.#debug) console.groupCollapsed('Seeds (click to view)')
while (nonce == null) {
start = performance.now()
- crypto.getRandomValues(seed)
- this.#draw(seed)
+ const random0 = Math.floor(Math.random() * 0xffffffff)
+ const random1 = Math.floor(Math.random() * 0xffffffff)
+ this.#seed[0] = (BigInt(random0) << 32n) | BigInt(random1)
+ if (this.#debug) console.log('Seed', this.#seed)
+ this.#draw(this.#seed)
const found = await this.#checkQueryResult()
times.push(performance.now() - start)
if (found) {
+ if (this.#debug) console.groupEnd()
nonce = this.#readResult()
}
}
this.#busy = false
- if (debug) this.#logAverages(times)
+ if (this.#debug) this.#logAverages(times)
return nonce
}
* @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation
*/
static async validate (work: string, hash: string, options?: NanoPowOptions): Promise<boolean> {
- if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required')
- if (this.#gl == null) throw new Error('WebGL 2 is required')
- if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`)
- if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`)
if (this.#busy) {
+ console.log('NanoPowGl is busy. Retrying validate...')
return new Promise(resolve => {
setTimeout(async (): Promise<void> => {
const result = this.validate(work, hash, options)
})
}
this.#busy = true
+
+ /** Process user input */
+ if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`)
+ if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`)
const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff)
? 0xfffffff8
: options.threshold
- const debug = !!(options?.debug)
+ this.#debug = !!(options?.debug)
- /** Reset if user specified new level of effort */
- if (this.#WORKLOAD !== 1) {
- this.#WORKLOAD = 1
- this.reset()
- }
+ if (NanoPowGl.#gl == null) throw new Error('WebGL 2 is required')
+ if (this.#gl == null) throw new Error('WebGL 2 is required')
+ if (this.#drawFbo == null) throw new Error('WebGL framebuffer is required')
+
+ /** Clear any previous results */
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#drawFbo.framebuffer)
+ this.#gl.clearBufferuiv(this.#gl.COLOR, 0, [0, 0, 0, 0])
+ this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
/** Set up uniform buffer object */
- const uboView = new DataView(new ArrayBuffer(144))
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0)
for (let i = 0; i < 64; i += 8) {
const uint32 = hash.slice(i, i + 8)
- uboView.setUint32(i * 2, parseInt(uint32, 16))
+ this.#uboView.setUint32(i * 2, parseInt(uint32, 16))
}
- uboView.setUint32(128, threshold, true)
- uboView.setFloat32(132, NanoPowGl.#WORKLOAD - 1, true)
- NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, NanoPowGl.#uboBuffer)
- NanoPowGl.#gl.bufferSubData(NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView)
- NanoPowGl.#gl.bindBuffer(NanoPowGl.#gl.UNIFORM_BUFFER, null)
+ this.#uboView.setUint32(128, threshold, true)
+ this.#uboView.setUint32(132, 0, true)
+ if (this.#debug) console.log('UBO', this.#uboView.buffer.slice(0))
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer)
+ this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, this.#uboView)
+ this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
/** Start drawing to calculate one nonce per pixel */
let nonce = null
- const data = new DataView(new ArrayBuffer(8))
- data.setBigUint64(0, BigInt(`0x${work}`), true)
- const seed = new Uint8Array(data.buffer)
- this.#draw(seed)
+ this.#seed[0] = BigInt(`0x${work}`)
+ if (this.#debug) console.log('Work', this.#seed)
+ this.#draw(this.#seed)
let found = await this.#checkQueryResult()
if (found) {
try {
// Initialize WebGPU
static #busy: boolean = false
+ static #debug: boolean = false
static #device: GPUDevice | null = null
- static #uboBuffer: GPUBuffer
+ static #gpuBufferReset: BigUint64Array = new BigUint64Array([0n, 0n])
static #gpuBuffer: GPUBuffer
static #cpuBuffer: GPUBuffer
+ static #uboBuffer: GPUBuffer
+ static #uboView: DataView
static #bindGroupLayout: GPUBindGroupLayout
static #searchPipeline: GPUComputePipeline
static #validatePipeline: GPUComputePipeline
this.#device = device
this.setup()
} catch (err) {
- throw new Error(`WebGPU initialization failed. ${err}`)
+ throw new Error('WebGPU initialization failed.', { cause: err })
} finally {
this.#busy = false
}
static setup (): void {
if (this.#device == null) throw new Error(`WebGPU device failed to load.`)
// Create buffers for writing GPU calculations and reading from Javascript
- this.#uboBuffer = this.#device.createBuffer({
- size: 48,
- usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
- })
this.#gpuBuffer = this.#device.createBuffer({
size: 16,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
size: 16,
usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
})
+ this.#uboBuffer = this.#device.createBuffer({
+ size: 48,
+ usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
+ })
+ this.#uboView = new DataView(new ArrayBuffer(48))
// Create binding group data structure and use it later once UBO is known
this.#bindGroupLayout = this.#device.createBindGroupLayout({
entries: [
module: shaderModule
}
})
+ console.log(`NanoPow WebGPU initialized. Recommended effort: ${Math.max(1, Math.floor(navigator.hardwareConcurrency / 2))}`)
}
static reset (): void {
if (this.#device == null) throw new Error(`WebGPU device failed to load.`)
// Set up uniform buffer object
// Note: u32 size is 4, but total alignment must be multiple of 16
- const uboView = new DataView(new ArrayBuffer(48))
+ for (let i = 0; i < this.#uboView.byteLength; i++) this.#uboView.setUint8(i, 0)
for (let i = 0; i < 64; i += 16) {
const u64 = hash.slice(i, i + 16)
- uboView.setBigUint64(i / 2, BigInt(`0x${u64}`))
+ this.#uboView.setBigUint64(i / 2, BigInt(`0x${u64}`))
}
- uboView.setBigUint64(32, seed, true)
- uboView.setUint32(40, threshold, true)
- this.#device.queue.writeBuffer(this.#uboBuffer, 0, uboView)
+ this.#uboView.setBigUint64(32, seed, true)
+ this.#uboView.setUint32(40, threshold, true)
+ if (this.#debug) console.log('UBO', this.#uboView)
+ this.#device.queue.writeBuffer(this.#uboBuffer, 0, this.#uboView)
// Reset `nonce` and `found` to 0u in WORK before each calculation
- this.#device.queue.writeBuffer(this.#gpuBuffer, 0, new Uint32Array([0, 0, 0]))
+ this.#device.queue.writeBuffer(this.#gpuBuffer, 0, this.#gpuBufferReset)
// Bind UBO read and GPU write buffers
const bindGroup = this.#device.createBindGroup({
console.warn(`Error getting data from GPU. ${err}`)
return this.#dispatch(pipeline, seed, hash, threshold, passes)
}
+ if (this.#debug) console.log('gpuBuffer data', data)
if (data == null) throw new Error(`Failed to get data from buffer.`)
return data
}
static async search (hash: string, options?: NanoPowOptions): Promise<string> {
if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`)
if (this.#busy) {
+ console.log('NanoPowGpu is busy. Retrying search...')
return new Promise(resolve => {
setTimeout(async (): Promise<void> => {
const result = this.search(hash, options)
const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x20)
? 0x800
: options.effort * 0x100
- const debug = !!(options?.debug)
+ this.#debug = !!(options?.debug)
// Ensure WebGPU is initialized before calculating
let loads = 0
let nonce = 0n
do {
start = performance.now()
- const random = Math.floor(Math.random() * 0xffffffff)
- const seed = (BigInt(random) << 32n) | BigInt(random)
+ const random0 = Math.floor(Math.random() * 0xffffffff)
+ const random1 = Math.floor(Math.random() * 0xffffffff)
+ const seed = (BigInt(random0) << 32n) | BigInt(random1)
+ if (this.#debug) console.log(`seed: ${seed}`)
const data = await this.#dispatch(this.#searchPipeline, seed, hash, threshold, effort)
nonce = data.getBigUint64(0, true)
this.#busy = !data.getUint32(8)
times.push(performance.now() - start)
} while (this.#busy)
- if (debug) this.#logAverages(times)
+ if (this.#debug) this.#logAverages(times)
return nonce.toString(16).padStart(16, '0')
}
if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new TypeError(`Invalid work ${work}`)
if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`)
if (this.#busy) {
+ console.log('NanoPowGpu is busy. Retrying validate...')
return new Promise(resolve => {
setTimeout(async (): Promise<void> => {
const result = this.validate(work, hash, options)
})
}
this.#busy = true
- const debug = !!(options?.debug)
+ this.#debug = !!(options?.debug)
const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff)
? 0xfffffff8
: options.threshold
}
const seed = BigInt(`0x${work}`)
+ if (this.#debug) console.log(`work: ${work}`)
const data = await this.#dispatch(this.#validatePipeline, seed, hash, threshold, 1)
const nonce = data.getBigUint64(0, true).toString(16).padStart(16, '0')
+ if (this.#debug) console.log(`nonce: ${nonce}`)
const found = !!data.getUint32(8)
this.#busy = false
if (found && work !== nonce) throw new Error(`Nonce (${nonce}) found but does not match work (${work})`)
await NanoPowGpu.init()
isGpuSupported = true
} catch (err) {
- console.error(err)
- console.warn(`WebGPU is not supported in this environment.`)
+ console.warn('WebGPU is not supported in this environment.\n', err)
isGpuSupported = false
}
try {
await NanoPowGl.init()
isGlSupported = true
} catch (err) {
- console.error(err)
- console.warn(`WebGL is not supported in this environment.`)
+ console.warn('WebGL is not supported in this environment.\n', err)
isGlSupported = false
}
*/
const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u);
+/**
+* The numeric literal used in the finalization digest is the original value of
+* the first element of the initialization vector `blake2b_IV[0]`, which in
+* NanoPow is initialized at vector component `v01.y`.
+*/
+const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u);
+
+/**
+* Used to fill partial `m` vec4 constructions.
+*/
+const Z = vec2(0u);
+
+/**
+* Used to apply boolean mask to swizzled result of carry bit comparison.
+*/
+const CARRY = vec4(1u, 0u, 1u, 0u);
+
/**
* Used to rotate bits by a fixed amount during G mixing.
*/
-const ROTATE_1 = vec2(1u);
-const ROTATE_8 = vec2(8u);
-const ROTATE_16 = vec2(16u);
-const ROTATE_24 = vec2(24u);
-const ROTATE_31 = vec2(31u);
+const ROTATE_1 = vec4(1u);
+const ROTATE_8 = vec4(8u);
+const ROTATE_16 = vec4(16u);
+const ROTATE_24 = vec4(24u);
+const ROTATE_31 = vec4(31u);
/**
* Shared flag to prevent execution for all workgroup threads based on the
*/
@compute @workgroup_size(32)
fn search(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
- // found = (local_id.x == 0u && atomicLoad(&work.found) != 0u);
- // workgroupBarrier();
- // if (found) { return; }
+ found = (local_id.x == 0u && atomicLoad(&work.found) != 0u);
+ workgroupBarrier();
+ if (found) { return; }
main(global_id);
}
* It is always the "last" compression at this INLEN
* vE = ~vE;
*/
- var v0: vec2<u32> = BLAKE2B_IV_0;
- var v1: vec2<u32> = vec2(0x84CAA73Bu, 0xBB67AE85u);
- var v2: vec2<u32> = vec2(0xFE94F82Bu, 0x3C6EF372u);
- var v3: vec2<u32> = vec2(0x5F1D36F1u, 0xA54FF53Au);
- var v4: vec2<u32> = vec2(0xADE682D1u, 0x510E527Fu);
- var v5: vec2<u32> = vec2(0x2B3E6C1Fu, 0x9B05688Cu);
- var v6: vec2<u32> = vec2(0xFB41BD6Bu, 0x1F83D9ABu);
- var v7: vec2<u32> = vec2(0x137E2179u, 0x5BE0CD19u);
- var v8: vec2<u32> = vec2(0xF3BCC908u, 0x6A09E667u);
- var v9: vec2<u32> = vec2(0x84CAA73Bu, 0xBB67AE85u);
- var vA: vec2<u32> = vec2(0xFE94F82Bu, 0x3C6EF372u);
- var vB: vec2<u32> = vec2(0x5F1D36F1u, 0xA54FF53Au);
- var vC: vec2<u32> = vec2(0xADE682F9u, 0x510E527Fu);
- var vD: vec2<u32> = vec2(0x2B3E6C1Fu, 0x9B05688Cu);
- var vE: vec2<u32> = vec2(0x04BE4294u, 0xE07C2654u);
- var vF: vec2<u32> = vec2(0x137E2179u, 0x5BE0CD19u);
+ var v01: vec4<u32> = vec4(BLAKE2B_IV_0, 0x84CAA73Bu, 0xBB67AE85u);
+ var v23: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
+ var v45: vec4<u32> = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
+ var v67: vec4<u32> = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u);
+ var v89: vec4<u32> = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u);
+ var vAB: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
+ var vCD: vec4<u32> = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
+ var vEF: vec4<u32> = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u);
+
+ /**
+ * Temporary variables used for subprocesses i=4 through i=7
+ */
+ var v56: vec4<u32>;
+ var vFC: vec4<u32>;
+ var v74: vec4<u32>;
+ var vDE: vec4<u32>;
/**
* Twelve rounds of G mixing as part of BLAKE2b compression step, each divided
* Each sum step has an extra carry addition. Note that the m[sigma] sum is
* skipped if m[sigma] is zero since it effectively does nothing.
*/
+ var s0: vec4<u32>;
+ var s1: vec4<u32>;
/****************************************************************************
* ROUND(0) *
****************************************************************************/
/**
- * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
- */
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1
+ * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3
+ * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
+ * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
+ */
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ s0 = v01 + vec4(m0, m2);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ v23 += vec4(m4, Z);
+ v23.y += u32(v23.x < m4.x);
- /**
- * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + vec4(m1, m3);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ // NOP
- /**
- * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
+ * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11
+ * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
+ * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+ // NOP
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ // NOP
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
- */
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10
+ * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8
+ * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15
+ * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6
+ */
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m4);
+ v01.w += u32(v01.z < m4.x);
+ // NOP
+
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ // NOP
+ // NOP
- /**
- * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
- /**
- * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12
+ * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2
+ * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7
+ * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ s0 = v01 + vec4(m1, m0);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m2);
+ v01.w += u32(v01.z < m2.x);
+ v23 += vec4(Z, m3);
+ v23.w += u32(v23.z < m3.x);
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=11, m[sigma+1]=8
+ * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=12, m[sigma+1]=0
+ * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=5, m[sigma+1]=2
+ * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=15, m[sigma+1]=13
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ // NOP
+
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + vec4(Z, m0);
+ v01= s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + vec4(m2, Z);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
- /**
- * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=10, m[sigma+1]=14
+ * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=3, m[sigma+1]=6
+ * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=7, m[sigma+1]=1
+ * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=9, m[sigma+1]=4
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
+ v01.z += m3.x;
+ v01.w += m3.y + u32(v01.z < m3.x);
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(m1, m4) + vec4(0u, u32(v23.x + vec4(m1, m4).x < v23.x), 0u, u32(v23.z + vec4(m1, m4).z < v23.z));
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=7, m[sigma+1]=9
+ * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=3, m[sigma+1]=1
+ * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=13, m[sigma+1]=12
+ * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=11, m[sigma+1]=14
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
+ // NOP
- /**
- * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z));
+ // NOP
+
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=2, m[sigma+1]=6
+ * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=5, m[sigma+1]=10
+ * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=4, m[sigma+1]=0
+ * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=15, m[sigma+1]=8
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
+ v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z);
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=9, m[sigma+1]=0
+ * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=5, m[sigma+1]=7
+ * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=2, m[sigma+1]=4
+ * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=10, m[sigma+1]=15
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ // NOP
+ s1 = v23 + vec4(m2, Z);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z);
+ v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
+
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
*/
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
/**
- * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=14, m[sigma+1]=1
+ * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=11, m[sigma+1]=12
+ * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=6, m[sigma+1]=8
+ * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=3, m[sigma+1]=13
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ s1 = v23 + vec4(Z, m3);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m1, Z) + vec4(0u, u32(v01.x + vec4(m1, Z).x < v01.x), Z);
+ // NOP
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=2, m[sigma+1]=12
+ * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=6, m[sigma+1]=10
+ * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=0, m[sigma+1]=11
+ * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=8, m[sigma+1]=3
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
+ v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z);
- /**
- * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m4 + vec2(0u, u32(v0.x + m4.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ // NOP
+ s1 = v23 + vec4(Z, m3);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=4, m[sigma+1]=13
+ * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=7, m[sigma+1]=5
+ * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=15, m[sigma+1]=14
+ * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=1, m[sigma+1]=9
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m4, Z) + vec4(0u, u32(v01.x + vec4(m4, Z).x < v01.x), Z);
+ v23 += vec4(Z, m1) + vec4(Z, 0u, u32(v23.z + vec4(Z, m1).z < v23.z));
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ // NOP
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=12, m[sigma+1]=5
+ * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=1, m[sigma+1]=15
+ * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=14, m[sigma+1]=13
+ * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=4, m[sigma+1]=10
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z));
+ v23 += vec4(Z, m4) + vec4(Z, 0u, u32(v23.z + vec4(Z, m4).z < v23.z));
- /**
- * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ // NOP
+ // NOP
- /**
- * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=0, m[sigma+1]=7
+ * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=6, m[sigma+1]=3
+ * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=9, m[sigma+1]=2
+ * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=8, m[sigma+1]=11
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z);
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
+ s1 = v23 + vec4(m2, Z);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=13, m[sigma+1]=11
+ * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=7, m[sigma+1]=14
+ * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=12, m[sigma+1]=1
+ * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=3, m[sigma+1]=9
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ s1 = v23 + vec4(Z, m3);
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ // NOP
+ v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z);
- /**
- * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=5, m[sigma+1]=0
+ * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=15, m[sigma+1]=4
+ * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=8, m[sigma+1]=6
+ * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=2, m[sigma+1]=10
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m2 + vec2(0u, u32(v3.x + m2.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(Z, m2) + vec4(Z, 0u, u32(v23.z + vec4(Z, m2).z < v23.z));
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m0, m4) + vec4(0u, u32(v01.x + vec4(m0, m4).x < v01.x), 0u, u32(v01.z + vec4(m0, m4).z < v01.z));
+ // NOP
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
+ * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=6, m[sigma+1]=15
+ * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=14, m[sigma+1]=9
+ * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=11, m[sigma+1]=3
+ * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=0, m[sigma+1]=8
*/
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(Z, m0);
+ v23.w += u32(v23.z < m0.x);
- /**
- * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ // NOP
+ v23 += vec4(m3, Z) + vec4(0u, u32(v23.x + vec4(m3, Z).x < v23.x), Z);
- /**
- * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=12, m[sigma+1]=2
+ * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=13, m[sigma+1]=7
+ * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=1, m[sigma+1]=4
+ * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=10, m[sigma+1]=5
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z);
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
+ v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
- */
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=10, m[sigma+1]=2
+ * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=8, m[sigma+1]=4
+ * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=7, m[sigma+1]=6
+ * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=1, m[sigma+1]=5
+ */
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(Z, m1);
+ v23.w += u32(v23.z < m1.x);
+
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + vec4(m2, m4);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ // NOP
- /**
- * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
- /**
- * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=15, m[sigma+1]=11
+ * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=9, m[sigma+1]=14
+ * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=3, m[sigma+1]=12
+ * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=13, m[sigma+1]=0
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(m3, Z);
+ v23.y += u32(v23.x < m3.x);
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ v23 += vec4(Z, m0);
+ v23.w += u32(v23.z < m0.x);
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
- */
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);
+ * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1
+ * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3
+ * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
+ * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
+ */
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ s0 = v01 + vec4(m0, m2);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ v23 += vec4(m4, Z);
+ v23.y += u32(v23.x < m4.x);
- /**
- * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) << ROTATE_1) | ((v6 ^ vA).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- vF = ((vF ^ v0) >> ROTATE_16) | ((vF ^ v0).yx << ROTATE_16);
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) << ROTATE_1) | ((v5 ^ vA).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = (vC ^ v1).yx;
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) >> ROTATE_24) | ((v6 ^ vB).yx << ROTATE_8);
- v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
- vC = ((vC ^ v1) >> ROTATE_16) | ((vC ^ v1).yx << ROTATE_16);
- vB = vB + vC + vec2(0u, u32(vB.x + vC.x < vB.x));
- v6 = ((v6 ^ vB) << ROTATE_1) | ((v6 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + vec4(m1, m3);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ // NOP
- /**
- * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
+
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
+ * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11
+ * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
+ * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
*/
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = (vE ^ v3).yx;
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
- v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
- vE = ((vE ^ v3) >> ROTATE_16) | ((vE ^ v3).yx << ROTATE_16);
- v9 = v9 + vE + vec2(0u, u32(v9.x + vE.x < v9.x));
- v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ // NOP
+ // NOP
+
+ vFC ^= v01;
+ vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
+
+ v45 = vec4(v74.zw, v56.xy);
+ v67 = vec4(v56.zw, v74.xy);
+ vCD = vec4(vFC.zw, vDE.xy);
+ vEF = vec4(vDE.zw, vFC.xy);
****************************************************************************/
/**
- * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
- */
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = (vC ^ v0).yx;
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
- v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
- vC = ((vC ^ v0) >> ROTATE_16) | ((vC ^ v0).yx << ROTATE_16);
- v8 = v8 + vC + vec2(0u, u32(v8.x + vC.x < v8.x));
- // skip since it does not affect the final values of `v0` and `v8`
+ * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10
+ * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8
+ * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15
+ * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6
+ */
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m4);
+ v01.w += u32(v01.z < m4.x);
+ // NOP
+
+ vCD = (vCD ^ v01).yxwz;
+ vEF = (vEF ^ v23).yxwz;
- /**
- * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
- */
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x));
- vD = (vD ^ v1).yx;
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
- v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
- vD = ((vD ^ v1) >> ROTATE_16) | ((vD ^ v1).yx << ROTATE_16);
- v9 = v9 + vD + vec2(0u, u32(v9.x + vD.x < v9.x));
- v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
- /**
- * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
- */
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = (vE ^ v2).yx;
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- v6 = ((v6 ^ vA) >> ROTATE_24) | ((v6 ^ vA).yx << ROTATE_8);
- v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
- vE = ((vE ^ v2) >> ROTATE_16) | ((vE ^ v2).yx << ROTATE_16);
- vA = vA + vE + vec2(0u, u32(vA.x + vE.x < vA.x));
- // skip since it does not affect the final values of `v0` and `v8`
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
- /**
- * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
- */
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = (vF ^ v3).yx;
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) >> ROTATE_24) | ((v7 ^ vB).yx << ROTATE_8);
- v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
- vF = ((vF ^ v3) >> ROTATE_16) | ((vF ^ v3).yx << ROTATE_16);
- vB = vB + vF + vec2(0u, u32(vB.x + vF.x < vB.x));
- v7 = ((v7 ^ vB) << ROTATE_1) | ((v7 ^ vB).yx >> ROTATE_31);
+ s0 = v01 + v45;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v67;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
- /**
- * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
- */
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
- vF = (vF ^ v0).yx;
- vA = vA + vF + vec2(0u, u32(vA.x + vF.x < vA.x));
- v5 = ((v5 ^ vA) >> ROTATE_24) | ((v5 ^ vA).yx << ROTATE_8);
- v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
- // skip since it does not affect the final values of `v0` and `v8`
- // skip since it does not affect the final values of `v0` and `v8`
- // skip since it does not affect the final values of `v0` and `v8`
+ // NOP
+ // NOP
- /**
- * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
- */
- // skip entire step since it does not affect the final values of `v0` and `v8`
+ vCD ^= v01;
+ vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
+ vEF ^= v23;
+ vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
- /**
- * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
- */
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = (vD ^ v2).yx;
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
- v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
- vD = ((vD ^ v2) >> ROTATE_16) | ((vD ^ v2).yx << ROTATE_16);
- v8 = v8 + vD + vec2(0u, u32(v8.x + vD.x < v8.x));
- // skip since we already have the final values of `v0` and `v8`
+ s0 = v89 + vCD;
+ v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
+ s1 = vAB + vEF;
+ vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
+
+ v45 ^= v89;
+ v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
+ v67 ^= vAB;
+ v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
/**
- * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12
+ * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2
+ * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7
+ * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3
*/
- // skip entire step since we already have the final values of `v0` and `v8`
+ v56 = vec4(v45.zw, v67.xy);
+ v74 = vec4(v67.zw, v45.xy);
+ vFC = vec4(vEF.zw, vCD.xy);
+ vDE = vec4(vCD.zw, vEF.xy);
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ s0 = v01 + vec4(m1, m0);
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ // NOP
+
+ vFC = (vFC ^ v01).yxwz;
+ vDE = (vDE ^ v23).yxwz;
+
+ s0 = vAB + vFC;
+ vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
+
+ v56 ^= vAB;
+ v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
+ v74 ^= v89;
+ v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
+
+ s0 = v01 + v56;
+ v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
+ s1 = v23 + v74;
+ v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
+
+ v01 += vec4(Z, m2);
+ v01.w += u32(v01.z < m2.x);
+ v23 += vec4(Z, m3);
+ v23.w += u32(v23.z < m3.x);
+
+ // vFC ^= v01;
+ // vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
+ vDE ^= v23;
+ vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
+
+ // s0 = vAB + vFC;
+ // vAB = s0 + (vec4<u32>(s0 < vAB).yxwz & CARRY);
+ s1 = v89 + vDE;
+ v89 = s1 + (vec4<u32>(s1 < v89).yxwz & CARRY);
+
+ // v56 ^= vAB;
+ // v74 ^= v89;
+ // v56 = (v56 << ROTATE_1) | (v56 >> ROTATE_31).yxwz;
+ // v74 = (v74 << ROTATE_1) | (v74 >> ROTATE_31).yxwz;
+
+ // v45 = vec4(v74.zw, v56.xy);
+ // v67 = vec4(v56.zw, v74.xy);
+ // vCD = vec4(vFC.zw, vDE.xy);
+ // vEF = vec4(vDE.zw, vFC.xy);
* Set nonce if it passes the threshold and no other thread has set it.
* Only high bits are needed for comparison since threshold low bits are zero.
*/
- if ((BLAKE2B_IV_0.y ^ v0.y ^ v8.y) > ubo.threshold && atomicLoad(&work.found) == 0u) {
+ if ((BLAKE2B_IV_0.y ^ v01.y ^ v89.y) > ubo.threshold && atomicLoad(&work.found) == 0u) {
atomicStore(&work.found, 1u);
work.nonce = m0;
}
--- /dev/null
+// SPDX-FileCopyrightText: 2025 Chris Duncan <chris@zoso.dev>
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+export const NanoPowGlDownsampleShader = `#version 300 es
+#pragma vscode_glsllint_stage: frag
+#ifdef GL_FRAGMENT_PRECISION_HIGH
+precision highp float;
+#else
+precision mediump float;
+#endif
+precision highp int;
+
+out uvec4 nonce;
+
+// source texture to be downsampled
+uniform highp usampler2D src;
+
+void main() {
+ nonce = uvec4(0u);
+ vec2 inputSize = vec2(textureSize(src, 0));
+ vec2 texel = vec2(1.0) / inputSize;
+ vec2 blockCoord = (floor(gl_FragCoord.xy) * 2.0 + vec2(0.5)) / inputSize;
+
+ uvec4 pixel = texture(src, blockCoord);
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(texel.x, 0.0));
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(0.0, texel.y));
+ nonce = pixel.x == 0u ? nonce : pixel;
+
+ pixel = texture(src, blockCoord + vec2(texel.x, texel.y));
+ nonce = pixel.x == 0u ? nonce : pixel;
+}
+`
--- /dev/null
+// SPDX-FileCopyrightText: 2025 Chris Duncan <chris@zoso.dev>
+// SPDX-FileContributor: Ben Green <ben@latenightsketches.com>
+// SPDX-License-Identifier: GPL-3.0-or-later AND MIT
+
+export const NanoPowGlDrawShader = `#version 300 es
+#pragma vscode_glsllint_stage: frag
+#ifdef GL_FRAGMENT_PRECISION_HIGH
+precision highp float;
+#else
+precision mediump float;
+#endif
+
+out uvec4 nonce;
+
+// blockhash - Array of precalculated block hash components
+// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else
+// search - Checks all pixels if true, else only checks 1 pixel to validate
+layout(std140) uniform UBO {
+ uint blockhash[8];
+ uint threshold;
+ bool search;
+};
+
+// Random work seed values
+layout(std140) uniform WORK {
+ uvec2 seed;
+};
+
+// Defined separately from uint v[0].y below as the original value is required
+// to calculate the second uint32 of the digest for threshold comparison
+const uint BLAKE2B_IV32_1 = 0x6A09E667u;
+
+// Used during G for vector bit rotations
+const uvec4 ROTATE_1 = uvec4(1u);
+const uvec4 ROTATE_8 = uvec4(8u);
+const uvec4 ROTATE_16 = uvec4(16u);
+const uvec4 ROTATE_24 = uvec4(24u);
+const uvec4 ROTATE_31 = uvec4(31u);
+
+// Both buffers represent 16 uint64s as 32 uint32s
+// because that's what GLSL offers, just like JavaScript
+
+// Compression buffer, initialized to 2 instances of the initialization vector
+// The following values have been modified from the BLAKE2B_IV:
+// OUTLEN is constant 8 bytes
+// v[0] ^= 0x01010000u ^ uint(OUTLEN);
+// INLEN is constant 40 bytes: work value (8) + block hash (32)
+// v[12] ^= uint(INLEN);
+// It's always the "last" compression at this INLEN
+// v[14] = ~v[14];
+const uvec2 blake2b_iv[16] = uvec2[16](
+ uvec2(0xF2BDC900u, 0x6A09E667u),
+ uvec2(0x84CAA73Bu, 0xBB67AE85u),
+ uvec2(0xFE94F82Bu, 0x3C6EF372u),
+ uvec2(0x5F1D36F1u, 0xA54FF53Au),
+ uvec2(0xADE682D1u, 0x510E527Fu),
+ uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
+ uvec2(0xFB41BD6Bu, 0x1F83D9ABu),
+ uvec2(0x137E2179u, 0x5BE0CD19u),
+ uvec2(0xF3BCC908u, 0x6A09E667u),
+ uvec2(0x84CAA73Bu, 0xBB67AE85u),
+ uvec2(0xFE94F82Bu, 0x3C6EF372u),
+ uvec2(0x5F1D36F1u, 0xA54FF53Au),
+ uvec2(0xADE682F9u, 0x510E527Fu),
+ uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
+ uvec2(0x04BE4294u, 0xE07C2654u),
+ uvec2(0x137E2179u, 0x5BE0CD19u)
+);
+
+// Iterated initialization vector
+uvec2 v[16];
+
+// Input data buffer
+uvec2 m[16];
+
+// G mixing function, compressing two subprocesses into one
+void G (
+ uint a0, uint b0, uint c0, uint d0, uvec2 x0, uvec2 y0,
+ uint a1, uint b1, uint c1, uint d1, uvec2 x1, uvec2 y1
+) {
+ uvec4 a = uvec4(v[a0], v[a1]);
+ uvec4 b = uvec4(v[b0], v[b1]);
+ uvec4 c = uvec4(v[c0], v[c1]);
+ uvec4 d = uvec4(v[d0], v[d1]);
+ uvec4 mx = uvec4(x0, x1);
+ uvec4 my = uvec4(y0, y1);
+
+ a = a + b + uvec4(0u, uint(a.x + b.x < a.x), 0u, uint(a.z + b.z < a.z));
+ a = a + mx + uvec4(0u, uint(a.x + mx.x < a.x), 0u, uint(a.z + mx.z < a.z));
+ d = (d ^ a).yxwz;
+ c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z));
+ b = ((b ^ c) >> ROTATE_24) | ((b ^ c) << ROTATE_8).yxwz;
+ a = a + b + uvec4(0u, uint(a.x + b.x < b.x), 0u, uint(a.z + b.z < b.z));
+ a = a + my + uvec4(0u, uint(a.x + my.x < a.x), 0u, uint(a.z + my.z < a.z));
+ d = ((d ^ a) >> ROTATE_16) | ((d ^ a) << ROTATE_16).yxwz;
+ c = c + d + uvec4(0u, uint(c.x + d.x < c.x), 0u, uint(c.z + d.z < c.z));
+ b = ((b ^ c) >> ROTATE_31).yxwz | ((b ^ c) << ROTATE_1);
+
+ v[a0] = a.xy;
+ v[b0] = b.xy;
+ v[c0] = c.xy;
+ v[d0] = d.xy;
+ v[a1] = a.zw;
+ v[b1] = b.zw;
+ v[c1] = c.zw;
+ v[d1] = d.zw;
+}
+
+void main() {
+ // Initialize fragment output
+ nonce = uvec4(0u);
+
+ // Nonce uniquely differentiated by pixel location
+ m[0u] = seed ^ uvec2(gl_FragCoord);
+
+ // Block hash
+ m[1u] = uvec2(blockhash[0u], blockhash[1u]);
+ m[2u] = uvec2(blockhash[2u], blockhash[3u]);
+ m[3u] = uvec2(blockhash[4u], blockhash[5u]);
+ m[4u] = uvec2(blockhash[6u], blockhash[7u]);
+
+ // Reset v
+ v = blake2b_iv;
+
+ // Twelve rounds of G mixing
+
+ // Round 0
+ G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]);
+ G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]);
+ G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]);
+ G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]);
+
+ // Round 1
+ G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]);
+ G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]);
+ G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]);
+ G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]);
+
+ // Round 2
+ G(0u, 4u, 8u, 12u, m[11u], m[8u], 1u, 5u, 9u, 13u, m[12u], m[0u]);
+ G(2u, 6u, 10u, 14u, m[5u], m[2u], 3u, 7u, 11u, 15u, m[15u], m[13u]);
+ G(0u, 5u, 10u, 15u, m[10u], m[14u], 1u, 6u, 11u, 12u, m[3u], m[6u]);
+ G(2u, 7u, 8u, 13u, m[7u], m[1u], 3u, 4u, 9u, 14u, m[9u], m[4u]);
+
+ // Round 3
+ G(0u, 4u, 8u, 12u, m[7u], m[9u], 1u, 5u, 9u, 13u, m[3u], m[1u]);
+ G(2u, 6u, 10u, 14u, m[13u], m[12u], 3u, 7u, 11u, 15u, m[11u], m[14u]);
+ G(0u, 5u, 10u, 15u, m[2u], m[6u], 1u, 6u, 11u, 12u, m[5u], m[10u]);
+ G(2u, 7u, 8u, 13u, m[4u], m[0u], 3u, 4u, 9u, 14u, m[15u], m[8u]);
+
+ // Round 4
+ G(0u, 4u, 8u, 12u, m[9u], m[0u], 1u, 5u, 9u, 13u, m[5u], m[7u]);
+ G(2u, 6u, 10u, 14u, m[2u], m[4u], 3u, 7u, 11u, 15u, m[10u], m[15u]);
+ G(0u, 5u, 10u, 15u, m[14u], m[1u], 1u, 6u, 11u, 12u, m[11u], m[12u]);
+ G(2u, 7u, 8u, 13u, m[6u], m[8u], 3u, 4u, 9u, 14u, m[3u], m[13u]);
+
+ // Round 5
+ G(0u, 4u, 8u, 12u, m[2u], m[12u], 1u, 5u, 9u, 13u, m[6u], m[10u]);
+ G(2u, 6u, 10u, 14u, m[0u], m[11u], 3u, 7u, 11u, 15u, m[8u], m[3u]);
+ G(0u, 5u, 10u, 15u, m[4u], m[13u], 1u, 6u, 11u, 12u, m[7u], m[5u]);
+ G(2u, 7u, 8u, 13u, m[15u], m[14u], 3u, 4u, 9u, 14u, m[1u], m[9u]);
+
+ // Round 6
+ G(0u, 4u, 8u, 12u, m[12u], m[5u], 1u, 5u, 9u, 13u, m[1u], m[15u]);
+ G(2u, 6u, 10u, 14u, m[14u], m[13u], 3u, 7u, 11u, 15u, m[4u], m[10u]);
+ G(0u, 5u, 10u, 15u, m[0u], m[7u], 1u, 6u, 11u, 12u, m[6u], m[3u]);
+ G(2u, 7u, 8u, 13u, m[9u], m[2u], 3u, 4u, 9u, 14u, m[8u], m[11u]);
+
+ // Round 7
+ G(0u, 4u, 8u, 12u, m[13u], m[11u], 1u, 5u, 9u, 13u, m[7u], m[14u]);
+ G(2u, 6u, 10u, 14u, m[12u], m[1u], 3u, 7u, 11u, 15u, m[3u], m[9u]);
+ G(0u, 5u, 10u, 15u, m[5u], m[0u], 1u, 6u, 11u, 12u, m[15u], m[4u]);
+ G(2u, 7u, 8u, 13u, m[8u], m[6u], 3u, 4u, 9u, 14u, m[2u], m[10u]);
+
+ // Round 8
+ G(0u, 4u, 8u, 12u, m[6u], m[15u], 1u, 5u, 9u, 13u, m[14u], m[9u]);
+ G(2u, 6u, 10u, 14u, m[11u], m[3u], 3u, 7u, 11u, 15u, m[0u], m[8u]);
+ G(0u, 5u, 10u, 15u, m[12u], m[2u], 1u, 6u, 11u, 12u, m[13u], m[7u]);
+ G(2u, 7u, 8u, 13u, m[1u], m[4u], 3u, 4u, 9u, 14u, m[10u], m[5u]);
+
+ // Round 9
+ G(0u, 4u, 8u, 12u, m[10u], m[2u], 1u, 5u, 9u, 13u, m[8u], m[4u]);
+ G(2u, 6u, 10u, 14u, m[7u], m[6u], 3u, 7u, 11u, 15u, m[1u], m[5u]);
+ G(0u, 5u, 10u, 15u, m[15u], m[11u], 1u, 6u, 11u, 12u, m[9u], m[14u]);
+ G(2u, 7u, 8u, 13u, m[3u], m[12u], 3u, 4u, 9u, 14u, m[13u], m[0u]);
+
+ // Round 10
+ G(0u, 4u, 8u, 12u, m[0u], m[1u], 1u, 5u, 9u, 13u, m[2u], m[3u]);
+ G(2u, 6u, 10u, 14u, m[4u], m[5u], 3u, 7u, 11u, 15u, m[6u], m[7u]);
+ G(0u, 5u, 10u, 15u, m[8u], m[9u], 1u, 6u, 11u, 12u, m[10u], m[11u]);
+ G(2u, 7u, 8u, 13u, m[12u], m[13u], 3u, 4u, 9u, 14u, m[14u], m[15u]);
+
+ // Round 11
+ G(0u, 4u, 8u, 12u, m[14u], m[10u], 1u, 5u, 9u, 13u, m[4u], m[8u]);
+ G(2u, 6u, 10u, 14u, m[9u], m[15u], 3u, 7u, 11u, 15u, m[13u], m[6u]);
+ G(0u, 5u, 10u, 15u, m[1u], m[12u], 1u, 6u, 11u, 12u, m[0u], m[2u]);
+ G(2u, 7u, 8u, 13u, m[11u], m[7u], 3u, 4u, 9u, 14u, m[5u], m[3u]);
+
+ // Pixel data set from work seed values
+ // Finalize digest from high bits, low bits can be safely ignored
+ if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) >= threshold && (search || uvec2(gl_FragCoord) == uvec2(0u))) {
+ nonce = uvec4(1u, m[0u].y, m[0u].x, (uint(gl_FragCoord.x) << 16u) | uint(gl_FragCoord.y));
+ }
+
+ // Valid nonce not found
+ if (nonce.x == 0u) {
+ discard;
+ }
+}
+`
+++ /dev/null
-// SPDX-FileCopyrightText: 2025 Chris Duncan <chris@zoso.dev>
-// SPDX-FileContributor: Ben Green <ben@latenightsketches.com>
-// SPDX-License-Identifier: GPL-3.0-or-later AND MIT
-
-export const NanoPowGlFragmentShader = `#version 300 es
-#pragma vscode_glsllint_stage: frag
-precision highp float;
-
-in vec2 uv_pos;
-out uvec4 nonce;
-
-// blockhash - array of precalculated block hash components
-// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else
-// workload - Defines canvas size
-layout(std140) uniform UBO {
- uint blockhash[8];
- uint threshold;
- float workload;
-};
-
-// Random work values
-layout(std140) uniform WORK {
- uvec2 work;
-};
-
-// Defined separately from uint v[32] below as the original value is required
-// to calculate the second uint32 of the digest for threshold comparison
-const uint BLAKE2B_IV32_1 = 0x6A09E667u;
-
-// Used during G for vector bit rotations
-const uvec2 ROTATE_1 = uvec2(1u, 1u);
-const uvec2 ROTATE_8 = uvec2(8u, 8u);
-const uvec2 ROTATE_16 = uvec2(16u, 16u);
-const uvec2 ROTATE_24 = uvec2(24u, 24u);
-const uvec2 ROTATE_31 = uvec2(31u, 31u);
-
-// Both buffers represent 16 uint64s as 32 uint32s
-// because that's what GLSL offers, just like Javascript
-
-// Compression buffer, intialized to 2 instances of the initialization vector
-// The following values have been modified from the BLAKE2B_IV:
-// OUTLEN is constant 8 bytes
-// v[0] ^= 0x01010000u ^ uint(OUTLEN);
-// INLEN is constant 40 bytes: work value (8) + block hash (32)
-// v[24] ^= uint(INLEN);
-// It's always the "last" compression at this INLEN
-// v[28] = ~v[28];
-// v[29] = ~v[29];
-uvec2 v[16] = uvec2[16](
- uvec2(0xF2BDC900u, 0x6A09E667u),
- uvec2(0x84CAA73Bu, 0xBB67AE85u),
- uvec2(0xFE94F82Bu, 0x3C6EF372u),
- uvec2(0x5F1D36F1u, 0xA54FF53Au),
- uvec2(0xADE682D1u, 0x510E527Fu),
- uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
- uvec2(0xFB41BD6Bu, 0x1F83D9ABu),
- uvec2(0x137E2179u, 0x5BE0CD19u),
- uvec2(0xF3BCC908u, 0x6A09E667u),
- uvec2(0x84CAA73Bu, 0xBB67AE85u),
- uvec2(0xFE94F82Bu, 0x3C6EF372u),
- uvec2(0x5F1D36F1u, 0xA54FF53Au),
- uvec2(0xADE682F9u, 0x510E527Fu),
- uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
- uvec2(0x04BE4294u, 0xE07C2654u),
- uvec2(0x137E2179u, 0x5BE0CD19u)
-);
-
-// Input data buffer
-uvec2 m[16];
-
-// Offsets into the input data buffer for each mixing step
-const uint SIGMA[192] = uint[192](
- 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
- 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u,
- 11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u,
- 7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u,
- 9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u,
- 2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u,
- 12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u,
- 13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u,
- 6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u,
- 10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u,
- 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
- 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u
-);
-
-// G mixing function
-void G (uint a, uint b, uint c, uint d, uint x, uint y) {
- v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
- v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x));
- v[d] = (v[d] ^ v[a]).yx;
- v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
- v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ v[c]).yx << ROTATE_8);
- v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
- v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x));
- v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16);
- v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
- v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1);
-}
-
-void main() {
- // Nonce uniquely differentiated by pixel location
- m[0u].x = work.x ^ uint(uv_pos.x * workload);
- m[0u].y = work.y ^ uint(uv_pos.y * workload);
-
- // Block hash
- m[1u] = uvec2(blockhash[0u], blockhash[1u]);
- m[2u] = uvec2(blockhash[2u], blockhash[3u]);
- m[3u] = uvec2(blockhash[4u], blockhash[5u]);
- m[4u] = uvec2(blockhash[6u], blockhash[7u]);
-
- // twelve rounds of mixing
- for(uint i = 0u; i < 12u; i = i + 1u) {
- G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]);
- G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]);
- G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]);
- G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]);
- G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]);
- G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]);
- G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]);
- G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]);
- }
-
- // Pixel data set from work values
- // Finalize digest from high bits, low bits can be safely ignored
- if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) {
- nonce = uvec4(1u, m[0].y, m[0].x, 1u);
- } else {
- discard;
- }
-}
-`
export const NanoPowGlVertexShader = `#version 300 es
#pragma vscode_glsllint_stage: vert
+#ifdef GL_FRAGMENT_PRECISION_HIGH
precision highp float;
-layout (location=0) in vec4 position;
-layout (location=1) in vec2 uv;
+#else
+precision mediump float;
+#endif
-out vec2 uv_pos;
+layout (location=0) in vec4 position;
void main() {
- uv_pos = uv;
gl_Position = position;
}
`
// SPDX-License-Identifier: GPL-3.0-or-later
import { default as NanoPowGpuComputeShader } from "./compute.wgsl"
-import { NanoPowGlFragmentShader } from "./gl-fragment.js"
+import { NanoPowGlDownsampleShader } from "./gl-downsample.js"
+import { NanoPowGlDrawShader } from "./gl-draw.js"
import { NanoPowGlVertexShader } from "./gl-vertex.js"
export {
NanoPowGpuComputeShader,
- NanoPowGlFragmentShader,
+ NanoPowGlDownsampleShader,
+ NanoPowGlDrawShader,
NanoPowGlVertexShader
}
<!DOCTYPE html>
<head>
- <link rel="icon" href="./favicon.ico">
+ <link rel="shortcut icon" href="#">
<script type="module">
let NanoPow, NanoPowGl, NanoPowGpu
try {
logarithms += Math.log(times[i])
min = Math.min(min, times[i])
max = Math.max(max, times[i])
- if (i === Math.ceil(count / 2)) {
+ if (i + 1 === Math.floor(count / 2)) {
median = times[i]
}
if (count < 3 || (i > (0.1 * count) && i < (0.9 * (count - 1)))) {
truncatedCount++
}
}
- const title = `NanoPow (${type}) | Effort: ${effort} | Dispatch: ${(0x100 * effort) ** 2} | Threads: ${8 * 8 * (0x100 * effort) ** 2}`
+ const title = type === 'WebGPU'
+ ? `NanoPow (${type}) | Effort: ${effort} | Dispatch: ${(0x100 * effort) ** 2} | Threads: ${8 * 8 * (0x100 * effort) ** 2}`
+ : `NanoPow (${type}) | Effort: ${effort} | Frame: ${NanoPowGl.size} | Pixels: ${NanoPowGl.size ** 2}`
return {
[title]: {
count: count,
}
export async function run (threshold, size, effort, isOutputShown, isGlForced, isDebug) {
- NanoPow = isGlForced ? NanoPowGl : NanoPowGpu
- const type = (NanoPow === NanoPowGpu) ? 'WebGPU' : (NanoPow === NanoPowGl) ? 'WebGL' : 'unknown API'
+ const NP = isGlForced ? NanoPowGl : NanoPow
+ const type = (NP === NanoPowGpu) ? 'WebGPU' : (NP === NanoPowGl) ? 'WebGL' : 'unknown API'
document.getElementById('status').innerHTML = `TESTING IN PROGRESS 0/${size}`
console.log(`%cNanoPow`, 'color:green', 'Checking validate()')
const expect = []
let result
- result = await NanoPow.validate('47c83266398728cf', '92BA74A7D6DC7557F3EDA95ADC6341D51AC777A0A6FF0688A5C492AB2B2CB40D', { debug: true })
+ result = await NP.validate('47c83266398728cf', '92BA74A7D6DC7557F3EDA95ADC6341D51AC777A0A6FF0688A5C492AB2B2CB40D', { debug: isDebug })
console.log(`validate() output for good nonce 1 is ${result === true ? 'correct' : 'incorrect'}`)
expect.push(result === true)
- result = await NanoPow.validate('4a8fb104eebbd336', '8797585D56B8AEA3A62899C31FC088F9BE849BA8298A88E94F6E3112D4E55D01', { debug: true })
+ result = await NP.validate('4a8fb104eebbd336', '8797585D56B8AEA3A62899C31FC088F9BE849BA8298A88E94F6E3112D4E55D01', { debug: isDebug })
console.log(`validate() output for good nonce 2 is ${result === true ? 'correct' : 'incorrect'}`)
expect.push(result === true)
- result = await NanoPow.validate('c5d5d6f7c5d6ccd1', '281E89AC73B1082B464B9C3C1168384F846D39F6DF25105F8B4A22915E999117', { debug: true })
+ result = await NP.validate('c5d5d6f7c5d6ccd1', '281E89AC73B1082B464B9C3C1168384F846D39F6DF25105F8B4A22915E999117', { debug: isDebug })
console.log(`validate() output for colliding nonce is ${result === true ? 'correct' : 'incorrect'}`)
expect.push(result === true)
- result = await NanoPow.validate('0000000000000000', '0000000000000000000000000000000000000000000000000000000000000000', { debug: true })
+ result = await NP.validate('0000000000000000', '0000000000000000000000000000000000000000000000000000000000000000', { debug: isDebug })
console.log(`validate() output for bad nonce 1 is ${result === false ? 'correct' : 'incorrect'}`)
expect.push(result === false)
- result = await NanoPow.validate('c5d5d6f7c5d6ccd1', 'BA1E946BA3D778C2F30A83D44D2132CC6EEF010D8D06FF10A8ABD0100D8FB47E', { debug: true })
+ result = await NP.validate('c5d5d6f7c5d6ccd1', 'BA1E946BA3D778C2F30A83D44D2132CC6EEF010D8D06FF10A8ABD0100D8FB47E', { debug: isDebug })
console.log(`validate() output for bad nonce 2 is ${result === false ? 'correct' : 'incorrect'}`)
expect.push(result === false)
- result = await NanoPow.validate('47c83266398728ce', '92BA74A7D6DC7557F3EDA95ADC6341D51AC777A0A6FF0688A5C492AB2B2CB40D', { debug: true })
+ result = await NP.validate('47c83266398728ce', '92BA74A7D6DC7557F3EDA95ADC6341D51AC777A0A6FF0688A5C492AB2B2CB40D', { debug: isDebug })
console.log(`validate() output for slightly wrong nonce is ${result === false ? 'correct' : 'incorrect'}`)
expect.push(result === false)
let work = null
const start = performance.now()
try {
- work = await NanoPow.search(hash, { threshold, effort, debug: isDebug })
+ work = await NP.search(hash, { threshold, effort, debug: isDebug })
} catch (err) {
document.getElementById('output').innerHTML += `Error: ${err.message}<br/>`
console.error(err)
return
}
const end = performance.now()
- const isValid = (await NanoPow.validate(work, hash, { threshold: 0 })) ? 'VALID' : 'INVALID'
+ const isValid = (await NP.validate(work, hash)) ? 'VALID' : 'INVALID'
times.push(end - start)
const msg = `${isValid} [${work}] ${hash} (${end - start} ms)`
if (isOutputShown) document.getElementById('output').innerHTML += `${msg}<br/>`
const validation = document.getElementById('validation')
validation.innerText = '⏳'
if (work.value.length === 16 && hash.value.length === 64) {
- NanoPow.validate(work.value, hash.value, { threshold: `0x${+threshold.value}` })
+ NP.validate(work.value, hash.value, { threshold: `0x${+threshold.value}` })
.then(result => {
validation.innerText = result
? '✔️'
run(+`0x${threshold.value}`, +size.value, +effort.value, isOutputShown.checked, isGlForced.checked, isDebug.checked)
}
document.getElementById('btnStartTest').addEventListener('click', startTest)
+ document.getElementById('effort').value = Math.max(1, Math.floor(navigator.hardwareConcurrency))
</script>
<style>body{background:black;color:white;}a{color:darkcyan;}input[type=number]{width:5em;}span{margin:0.5em;}</style>
</head>
<label for="size">Test Size</label>
<input id="size" type="number" value="1" autofocus />
<label for="effort">Effort (1-32)</label>
- <input id="effort" type="number" value="8" min="1" max="32" />
+ <input id="effort" type="number" min="1" max="32" />
<span>
<label for="isOutputShown">Show output?</label>
<input id="isOutputShown" type="checkbox" checked />
import "@webgpu/types"
-export declare const NanoPowGlFragmentShader: string
+export declare const NanoPowGlDownsampleShader: string
+export declare const NanoPowGlDrawShader: string
export declare const NanoPowGlVertexShader: string
export declare const NanoPowGpuComputeShader: any
declare const NanoPow: typeof NanoPowGl | typeof NanoPowGpu | null
+/**
+* Used to create WebGL framebuffer objects.
+*
+* @property {WebGLTexture} texture - Holds the texture data
+* @property {WebGLFramebuffer} framebuffer - Render target attached to the texture
+* @property {{ x: number, y: number }} size - 2D lengths of the texture storage
+*/
+export type FBO = {
+ texture: WebGLTexture
+ framebuffer: WebGLFramebuffer
+ size: {
+ x: number
+ y: number
+ }
+}
+
/**
* Used to configure NanoPow.
*