From 71fc2d8c5a2603168fceb2b904394902d093afc4 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Mon, 3 Feb 2025 15:15:59 -0800 Subject: [PATCH] Convert eight sequential rounds of vec2 G mixing into parallelized four rounds of vec4 G. Read threshold direct from uniform to save a redundant assignment. Fix test page validation executing on every single input event. Delete benchmark file since it is completely outdated. Delete bundle to be reuploaded after tweaking new build. Update comment documentation. --- benchmarks.md | 1554 -------------------------------- main.min.js | 759 ---------------- src/shaders/compute.wgsl | 1828 +++++++++++++++++--------------------- test.html | 23 +- 4 files changed, 849 insertions(+), 3315 deletions(-) delete mode 100644 benchmarks.md delete mode 100644 main.min.js diff --git a/benchmarks.md b/benchmarks.md deleted file mode 100644 index 8dc9d64..0000000 --- a/benchmarks.md +++ /dev/null @@ -1,1554 +0,0 @@ - - -2025-02-02 (desktop, temp vars, vec2 operations, vec2 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 154634.60000056028, - "rate": 6.62815637585526, - "min": 148.30000001192093, - "max": 364.80000001192093, - "median": 150.90000000596046, - "arithmetic": 151.01035156304715, - "truncated": 150.8715158928286, - "harmonic": 150.88339631489575, - "geometric": 150.9293154354564 - } -} - -2025-02-02 (desktop, vec2 operations, temp vars, select for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 155016.60000002384, - "rate": 6.623733858698416, - "min": 148.2000000178814, - "max": 405.2000000178814, - "median": 150.90000000596046, - "arithmetic": 151.38339843752328, - "truncated": 150.9722493887312, - "harmonic": 151.2095653216647, - "geometric": 151.27158853045546 - } -} - -2025-02-02 (desktop, temp vars, vec4 
operations, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 154134.300001204, - "rate": 6.652845880818371, - "min": 149.09999999403954, - "max": 332.90000000596046, - "median": 150.19999998807907, - "arithmetic": 150.5217773449258, - "truncated": 150.31161369350545, - "harmonic": 150.42122356282434, - "geometric": 150.45883671227364 - } -} - -2025-02-02 (desktop, compress-inline, no temp vars, vec4 operations, vec2 select for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 153803.49999952316, - "rate": 6.670591823407945, - "min": 149.19999998807907, - "max": 398.59999999403954, - "median": 149.90000000596046, - "arithmetic": 150.19873046828434, - "truncated": 149.91173594086123, - "harmonic": 150.04617979659852, - "geometric": 150.09857450851368 - } -} - -2025-02-02 (desktop, compress-inline, no temp vars, vec4 operations, vec4 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 153660.0999993682, - "rate": 6.671968353041561, - "min": 148.59999999403954, - "max": 299.69999998807907, - "median": 149.89999997615814, - "arithmetic": 150.058691405633, - "truncated": 149.88080684527355, - "harmonic": 149.98481082698984, - "geometric": 150.01345917720215 - } -} - -2025-02-02 (desktop, compress-inline, no temp vars, vec4 operations, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 153737.2999996543, - "rate": 6.672381966975314, - "min": 148.10000002384186, - "max": 399.90000000596046, - "median": 149.80000001192093, - "arithmetic": 150.1340820309124, - "truncated": 149.8715158918449, - "harmonic": 149.98033249642677, - 
"geometric": 150.0330563607954 - } -} - -2025-02-02 (desktop, compress, temp vars, vec4 operations, vec4 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 153746.30000066757, - "rate": 6.672626894103991, - "min": 147.19999998807907, - "max": 413, - "median": 149.80000001192093, - "arithmetic": 150.14287109440193, - "truncated": 149.8660146701161, - "harmonic": 149.97814196658746, - "geometric": 150.03380952128336 - } -} - -2025-02-02 (desktop, main, vec2 operations, no temp vars, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 1024, - "total": 153949.80000004172, - "rate": 6.663815400905096, - "min": 149.2999999821186, - "max": 334.40000000596046, - "median": 149.90000000596046, - "arithmetic": 150.34160156254075, - "truncated": 150.06418092916823, - "harmonic": 150.2393007860643, - "geometric": 150.2775205781935 - } -} - - - ------ - - - -2025-02-01 (laptop, main, vec2 operations, no temp vars, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 128, - "total": 215110.39999985695, - "rate": 0.5826656955575709, - "min": 1231.8999999761581, - "max": 1940.1000000238419, - "median": 1731.3999999761581, - "arithmetic": 1680.5499999988824, - "truncated": 1716.2499999988313, - "harmonic": 1658.1870357863268, - "geometric": 1670.1939377722429 - } -} - -2025-02-01 (desktop, main, vec2 operations, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 4096, - "total": 619720.5000002831, - "rate": 6.6117984273376305, - "min": 148, - "max": 363.09999999403954, - "median": 151.20000000298023, - "arithmetic": 151.29895019538162, - "truncated": 151.24478021975474, - 
"harmonic": 151.2643921172073, - "geometric": 151.27733674927475 - } -} - -2025-02-01 (laptop, main, u32 type cast for carries, zero threshold, no early return) -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 128, - "total": 223308.59999984503, - "rate": 0.55975739455978, - "min": 1235.5999999642372, - "max": 2072.899999976158, - "median": 1810.2000000476837, - "arithmetic": 1744.5984374987893, - "truncated": 1786.4882352942345, - "harmonic": 1717.3672232961123, - "geometric": 1732.0991790493958 - } -} - - - -main - -2025-01-31 -{ - "NanoPow (WebGPU) | Effort: 16 | Dispatch: 16777216 | Threads: 1073741824": { - "count": 4096, - "total": 1068449.1999994218, - "rate": 3.8973748396630357, - "min": 12.5, - "max": 2738.4000000059605, - "median": 315.40000000596046, - "arithmetic": 260.85185546860885, - "truncated": 256.5829670328706, - "harmonic": 98.0229216514215, - "geometric": 168.12396106469913 - } -} -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 4096, - "total": 1069178.7000000179, - "rate": 3.909655452661306, - "min": 3.699999988079071, - "max": 2211.300000011921, - "median": 316.59999999403954, - "arithmetic": 261.02995605469187, - "truncated": 255.77701465209654, - "harmonic": 73.26505112089419, - "geometric": 156.17227103593234 - } -} - - - -2025-01-30 -{ - "NanoPow (WebGPU) | Effort: 16 | Dispatch: 16777216 | Threads: 1073741824": { - "count": 4096, - "total": 1101947.3999997675, - "rate": 3.8753692602297347, - "min": 13.100000023841858, - "max": 2062.699999988079, - "median": 319.7000000178814, - "arithmetic": 269.03012695306825, - "truncated": 258.03992673996675, - "harmonic": 99.43298937394798, - "geometric": 171.24141726167608 - } -} - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 4096, - "total": 1090902.8999999762, - "rate": 3.799836314741894, - "min": 3.5999999940395355, - "max": 2885.5, - "median": 321.40000000596046, - 
"arithmetic": 266.3337158203067, - "truncated": 263.16923076933267, - "harmonic": 70.99602668653543, - "geometric": 156.60774106488077 - } -} - - - ------ - - - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 138372.20000001788, - "rate": 3.919394953010938, - "min": 3.7999999970197678, - "max": 1618.9000000059605, - "arithmetic": 270.2582031250349, - "truncated": 255.14142156859825, - "harmonic": 72.1287653118942, - "geometric": 157.72414891838542 - } -} - -Zero threshold benchmark (vectors) - -16.0569471600238 average on 512 passes 8 effort when logging mapAsync only -118.135686272967 average on 256 passes 24 effort when logging mapAsync only - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 1851.7000002861023, - "rate": 282.7638780600227, - "min": 3.300000011920929, - "max": 8.5, - "arithmetic": 3.6166015630587935, - "truncated": 3.5365196108525874, - "harmonic": 3.5909492641537284, - "geometric": 3.6023994761114566 - } -} - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 2059.3000002503395, - "rate": 273.1655059917369, - "min": 3.199999988079071, - "max": 13.399999976158142, - "arithmetic": 4.022070312988944, - "truncated": 3.6607843159752735, - "harmonic": 3.791171787620167, - "geometric": 3.8764125112547805 - } -} - - - ------ - - - -iPhone -88.7974 geometric mean on 256 passes 24 effort - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 10057.50000011921, - "rate": 64.04963847019617, - "min": 15.399999976158142, - "max": 36.89999997615814, - "arithmetic": 19.64355468773283, - "truncated": 15.612890624906868, - "harmonic": 19.455959597968477, - "geometric": 19.54743125733686 - } -} - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 10370.999999821186, - "rate": 
62.232594321365944, - "min": 17, - "max": 88.5, - "arithmetic": 20.255859374650754, - "truncated": 16.068749999976717, - "harmonic": 20.060722114122232, - "geometric": 20.134613393434872 - } -} - -{ - "NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 4096, - "total": 75452.59999632835, - "rate": 67.94629986572919, - "min": 13.400000035762787, - "max": 37.60000002384186, - "arithmetic": 18.421044920978602, - "truncated": 14.717504882180947, - "harmonic": 18.21353217876272, - "geometric": 18.31703265536469 - } -} - - - ------ - - - -Super benchmarks - -{ - (16,16) "NanoPow (WebGPU) | Effort: 4 | Dispatch: 1048576 | Threads: 67108864": { - "count": 4096, - "total": 1089734.8000002578, - "rate": 4.829357209724284, - "min": 5.899999998509884, - "max": 2568.10000000149, - "arithmetic": 266.04853515631294, - "truncated": 207.06689453131003, - "harmonic": 81.18982598620626, - "geometric": 161.6413047635922 - } -} - -{ - (8,8) "NanoPow (WebGPU) | Effort: 16 | Dispatch: 16777216 | Threads: 1073741824": { - "count": 4096, - "total": 1083644.400000088, - "rate": 4.819279376542908, - "min": 20.399999998509884, - "max": 2163.1999999955297, - "arithmetic": 264.56162109377146, - "truncated": 207.49990234377037, - "harmonic": 112.25554889616, - "geometric": 175.89628874036063 - } -} - - - ------ - - - -After fixing test page bug setting Effort incorrectly - -16,16 ------ -"NanoPow (WebGPU) | Effort: 16 | Dispatch: 16777216 | Threads: 1073741824": { - "count": 512, - "total": 164300.40000003576, - "rate": 3.8249916888598743, - "min": 77.30000000074506, - "max": 1980.5, - "arithmetic": 320.89921875006985, - "truncated": 261.4384765625655, - "harmonic": 198.34621659194366, - "geometric": 246.96739280242824 - } - -"NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 133643.70000002906, - "rate": 5.0202476786249735, - "min": 20.699999999254942, - "max": 1318.199999999255, - "arithmetic": 
261.02285156255675, - "truncated": 199.1933593750291, - "harmonic": 106.89582610328786, - "geometric": 171.1679993707545 - } - -"NanoPow (WebGPU) | Effort: 4 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 130945.59999998286, - "rate": 4.937419176281642, - "min": 7, - "max": 1650.7999999970198, - "arithmetic": 255.75312499996653, - "truncated": 202.53496093744616, - "harmonic": 78.50458783656828, - "geometric": 155.70211798018283 - } - - -8,8 ------ -"NanoPow (WebGPU) | Effort: 32 | Dispatch: 67108864 | Threads: 4294967296": { - "count": 512, - "total": 168521.00000004843, - "rate": 3.871414593719482, - "min": 76.30000000074506, - "max": 2497.300000000745, - "arithmetic": 329.1425781250946, - "truncated": 258.3035156250844, - "harmonic": 209.53630679307048, - "geometric": 259.087912072887 - } - -"NanoPow (WebGPU) | Effort: 16 | Dispatch: 16777216 | Threads: 1073741824": { - "count": 512, - "total": 128736.9999999404, - "rate": 4.969079128705403, - "min": 20.800000000745058, - "max": 1565.699999999255, - "arithmetic": 251.43945312488358, - "truncated": 201.24453124990396, - "harmonic": 106.21772967372581, - "geometric": 167.1924244801166 - } - -"NanoPow (WebGPU) | Effort: 8 | Dispatch: 4194304 | Threads: 268435456": { - "count": 512, - "total": 133008.6999999974, - "rate": 4.771005703775073, - "min": 6.5, - "max": 1757.5999999996275, - "arithmetic": 259.7826171874949, - "truncated": 209.5994140624789, - "harmonic": 77.38622139525103, - "geometric": 155.94476005599634 - } - -"NanoPow (WebGPU) | Effort: 1 | Dispatch: 65536 | Threads: 4194304": { - "count": 512, - "total": 178794.09999999218, - "rate": 3.724110341315865, - "min": 2.699999999254942, - "max": 3011.2999999988824, - "arithmetic": 349.2072265624847, - "truncated": 268.5205078125218, - "harmonic": 79.8080055571974, - "geometric": 202.6704071324123 - } - - - ------ - - - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 256) -{ - "count": 512, - "total": 159978.0000000596, - "rate": 
3.8420526166080182, - "min": 81, - "max": 1383.3999999910593, - "arithmetic": 312.4570312501164, - "truncated": 260.277539062663, - "harmonic": 196.8969789336903, - "geometric": 244.9313792773437 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 256) -{ - "count": 512, - "total": 167904.10000032187, - "rate": 3.772403249271633, - "min": 77.30000001192093, - "max": 1771.5, - "arithmetic": 327.93769531312864, - "truncated": 265.08300781287835, - "harmonic": 209.60304208880888, - "geometric": 258.285687628797 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) -{ - "count": 512, - "total": 165119.60000000894, - "rate": 3.793500222645779, - "min": 79.90000000596046, - "max": 2104.0999999940395, - "arithmetic": 322.49921875001746, - "truncated": 263.6087890625058, - "harmonic": 207.56200329474862, - "geometric": 256.16574636086983 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 256) -{ - "count": 512, - "total": 163759.80000016093, - "rate": 3.8946941638158328, - "min": 79.90000000596046, - "max": 2280, - "arithmetic": 319.8433593753143, - "truncated": 256.75957031251164, - "harmonic": 204.43707515011045, - "geometric": 252.168610285593 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 64) -{ - "count": 512, - "total": 150115.60000014305, - "rate": 4.336664038122414, - "min": 24.5, - "max": 1650.6000000089407, - "arithmetic": 293.1945312502794, - "truncated": 230.5919921878376, - "harmonic": 123.67804479239821, - "geometric": 194.04155076078126 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 64) -{ - "count": 512, - "total": 138818.7000001073, - "rate": 4.618541821704324, - "min": 24.599999994039536, - "max": 1552.699999988079, - "arithmetic": 271.13027343770955, - "truncated": 216.51855468767462, - "harmonic": 121.22290429039016, - "geometric": 185.89186266835625 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 135636.89999975264, - "rate": 4.719968508971256, - "min": 25.900000005960464, - "max": 
1588, - "arithmetic": 264.9158203120169, - "truncated": 211.8658203119994, - "harmonic": 124.27757176664986, - "geometric": 181.82369853819247 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 64) -{ - "count": 512, - "total": 140658.69999991357, - "rate": 4.752414960082367, - "min": 23.700000002980232, - "max": 1747.2000000029802, - "arithmetic": 274.7240234373312, - "truncated": 210.41933593750582, - "harmonic": 110.33702507461787, - "geometric": 175.14808439595487 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 32) -{ - "count": 512, - "total": 141774.90000011027, - "rate": 4.605960941090885, - "min": 16.100000008940697, - "max": 1443.2999999970198, - "arithmetic": 276.90410156271537, - "truncated": 217.10996093752328, - "harmonic": 114.98646010749061, - "geometric": 184.62279065842682 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 32) -{ - "count": 512, - "total": 143074.00000026822, - "rate": 4.8437740946897145, - "min": 19.399999991059303, - "max": 1366.8999999910593, - "arithmetic": 279.44140625052387, - "truncated": 206.4505859380006, - "harmonic": 111.60267816366408, - "geometric": 183.1466099827166 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 32) -{ - "count": 512, - "total": 144550.89999994636, - "rate": 4.644462748147642, - "min": 18.5, - "max": 1449.6000000089407, - "arithmetic": 282.3259765623952, - "truncated": 215.31015624979045, - "harmonic": 114.96709424095182, - "geometric": 186.24836896995765 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 32) -{ - "count": 512, - "total": 145009.69999994338, - "rate": 4.465056571918978, - "min": 14.200000002980232, - "max": 2710.2999999970198, - "arithmetic": 283.2220703123894, - "truncated": 223.9613281249476, - "harmonic": 105.73248510637895, - "geometric": 178.59991348235997 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 16,16) -{ - "count": 512, - "total": 171064.90000003576, - "rate": 3.6180268186231834, - "min": 81.3999999910593, - "max": 1453.3999999910593, - 
"arithmetic": 334.11113281256985, - "truncated": 276.39375000004657, - "harmonic": 217.83822531129073, - "geometric": 267.8978578209064 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 16,16) -{ - "count": 512, - "total": 167172.10000005364, - "rate": 3.8445942901746104, - "min": 78.29999999701977, - "max": 3207.6000000089407, - "arithmetic": 326.5080078126048, - "truncated": 260.1054687501455, - "harmonic": 210.8258370543174, - "geometric": 257.37154756448814 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 16,16) -{ - "count": 512, - "total": 161936.60000008345, - "rate": 3.740251239688252, - "min": 80.6000000089407, - "max": 1553.8999999910593, - "arithmetic": 316.282421875163, - "truncated": 267.3617187500349, - "harmonic": 204.59200263160372, - "geometric": 250.96434072996382 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 16,16) -{ - "count": 512, - "total": 158915.09999985993, - "rate": 3.7711889999175563, - "min": 81, - "max": 2144.199999988079, - "arithmetic": 310.3810546872264, - "truncated": 265.1683593746857, - "harmonic": 201.944298796899, - "geometric": 246.41513206426896 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 8,8) -{ - "count": 512, - "total": 139888.0000000596, - "rate": 4.623788851791469, - "min": 24.100000008940697, - "max": 1754.5, - "arithmetic": 273.2187500001164, - "truncated": 216.27285156253492, - "harmonic": 124.13430182526332, - "geometric": 186.66857347407046 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 8,8) -{ - "count": 512, - "total": 141471.0000000447, - "rate": 4.8109313380457674, - "min": 25.799999997019768, - "max": 1512.4000000059605, - "arithmetic": 276.3105468750873, - "truncated": 207.8599609376688, - "harmonic": 127.00893713470423, - "geometric": 192.06862657670237 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 8,8) -{ - "count": 4096, - "total": 1061022.09999986, - "rate": 4.8102132476979635, - "min": 5.5, - "max": 2067.7000000029802, - "arithmetic": 259.0385986327783, - 
"truncated": 207.8909912109557, - "harmonic": 75.27207858671284, - "geometric": 156.19893668278434 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 8,8) -{ - "count": 512, - "total": 133226.09999994934, - "rate": 4.905215336077563, - "min": 24.599999994039536, - "max": 1432.3999999910593, - "arithmetic": 260.20722656240105, - "truncated": 203.86464843756403, - "harmonic": 116.05589569169062, - "geometric": 178.05592700404114 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 8,8) -{ - "count": 512, - "total": 146197.10000024736, - "rate": 4.400326586729279, - "min": 25.399999991059303, - "max": 1748.5, - "arithmetic": 285.5412109379831, - "truncated": 227.25585937549477, - "harmonic": 130.29367565620703, - "geometric": 196.68629173860154 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 8,4) -{ - "count": 512, - "total": 146798.40000000596, - "rate": 4.501937943601046, - "min": 18.099999994039536, - "max": 1979.1000000089407, - "arithmetic": 286.71562500001164, - "truncated": 222.12656249990687, - "harmonic": 103.21952194085847, - "geometric": 176.78141792063872 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 8,4) -{ - "count": 512, - "total": 138210.7000002265, - "rate": 4.76931689781462, - "min": 16.799999997019768, - "max": 1626.0999999940395, - "arithmetic": 269.9427734379424, - "truncated": 209.67363281274447, - "harmonic": 101.34635015953711, - "geometric": 172.06200959967907 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 8,4) -{ - "count": 512, - "total": 149949.19999992847, - "rate": 4.415311818463056, - "min": 16.799999997019768, - "max": 1790.7999999970198, - "arithmetic": 292.8695312498603, - "truncated": 226.48457031243015, - "harmonic": 112.43872189933657, - "geometric": 189.39141120585325 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 8,4) -{ - "count": 512, - "total": 147114.90000009537, - "rate": 4.581196336470157, - "min": 17.600000008940697, - "max": 3584.5999999940395, - "arithmetic": 287.33378906268626, - 
"truncated": 218.28359375020955, - "harmonic": 118.69066922246898, - "geometric": 188.03357141542313 -} - -NanoPow (WebGPU) 3070 (dispatch 0x2000 workgroup 4,4) -{ - "count": 512, - "total": 275257.4999998361, - "rate": 2.5033687912081977, - "min": 16.700000002980232, - "max": 4439.20000000298, - "arithmetic": 537.6123046871799, - "truncated": 399.461718749546, - "harmonic": 179.34949948078622, - "geometric": 340.32029726440055 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 4,4) -{ - "count": 512, - "total": 265632.5000000298, - "rate": 2.4961071453998693, - "min": 19, - "max": 4817.0999999940395, - "arithmetic": 518.8134765625582, - "truncated": 400.6238281248952, - "harmonic": 152.2455042043822, - "geometric": 300.11657404770995 -} - ------ - - - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 256) -{ - "count": 2048, - "total": 547002.6999999285, - "rate": 3.744040020278269, - "min": 22.599999994039536, - "max": 2720.5, - "arithmetic": 267.0911621093401, - "truncated": 207.97036132813082, - "harmonic": 126.12851499707206, - "geometric": 183.20580886813133 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) -{ - "count": 2048, - "total": 543507.1000000834, - "rate": 3.768120048477169, - "min": 23.400000005960464, - "max": 2603.699999988079, - "arithmetic": 265.38432617191575, - "truncated": 213.63740234381112, - "harmonic": 130.06039992520343, - "geometric": 185.4924137470012 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 256) -{ - "count": 2048, - "total": 556310.1000003219, - "rate": 3.6813999961510944, - "min": 22.299999982118607, - "max": 2052.5, - "arithmetic": 271.63579101578216, - "truncated": 220.8329589846253, - "harmonic": 129.12128518320455, - "geometric": 186.29511532186783 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 128) -{ - "count": 2048, - "total": 548659.7999989986, - "rate": 3.7327320135423405, - "min": 13.699999988079071, - "max": 2517.0999999940395, - "arithmetic": 267.90029296826106, - "truncated": 
208.58183593714784, - "harmonic": 117.53834727081843, - "geometric": 177.7358800826145 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 128) -{ - "count": 2048, - "total": 566167.6000000024, - "rate": 3.6173034274656324, - "min": 15.399999999906868, - "max": 2000.2999999998137, - "arithmetic": 276.4490234375012, - "truncated": 218.4107421874994, - "harmonic": 122.29973069005368, - "geometric": 184.8814662429658 -} - - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 64) -{ - "count": 2048, - "total": 539941.3999999762, - "rate": 3.79300420378969, - "min": 9.5, - "max": 2157.300000011921, - "arithmetic": 263.64326171873836, - "truncated": 205.91958007811627, - "harmonic": 106.17587018373969, - "geometric": 171.19015104475864 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 2048, - "total": 535917.800000757, - "rate": 3.821481577953013, - "min": 10.300000011920929, - "max": 2099.4000000059605, - "arithmetic": 261.6786132816196, - "truncated": 206.90126953163417, - "harmonic": 108.36494901651302, - "geometric": 171.00530036092783 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 32) -{ - "count": 2048, - "total": 582139.9999999758, - "rate": 3.518054076339171, - "min": 9.200000000186265, - "max": 1908.5999999996275, - "arithmetic": 284.2480468749882, - "truncated": 218.86826171873827, - "harmonic": 123.58316256156988, - "geometric": 190.28283027676605 -} - - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 64, - "total": 14740.50000000745, - "rate": 4.341779451169747, - "min": 14.199999999254942, - "max": 1088.800000000745, - "arithmetic": 230.32031250011642, - "truncated": 181.82343750004657, - "harmonic": 87.12614438828882, - "geometric": 149.3650172355075 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) -{ - "count": 64, - "total": 15690.999999988824, - "rate": 4.078771270157771, - "min": 10.400000002235174, - "max": 989.8000000007451, - "arithmetic": 245.17187499982538, - "truncated": 186.7796874998021, 
- "harmonic": 76.98798798236564, - "geometric": 147.89606519234277 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) -{ - "count": 64, - "total": 16779.50000002235, - "rate": 4.4576316046016, - "min": 24.199999999254942, - "max": 1552, - "arithmetic": 262.17968750034925, - "truncated": 224.33437500032596, - "harmonic": 108.25402886637463, - "geometric": 169.6307793361205 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 64) -{ - "count": 64, - "total": 19302.5, - "rate": 4.219075495082348, - "min": 27.699999999254942, - "max": 1248.5, - "arithmetic": 301.6015625, - "truncated": 237.01874999998836, - "harmonic": 112.89841890437856, - "geometric": 188.28065698087113 -} - - - ------ - - - -nano-webgl-pow: Time to calculate proof-of-work for a send block 16 times -Total: 89756 ms -Average: 5609.75 ms -Harmonic: 2092.567565254879 ms -Geometric: 3612.112662613675 ms - -NanoPowGl: Time to calculate proof-of-work for a send block 16 times -Total: 33240 ms -Average: 2077.5 ms -Harmonic: 1328.5635414262717 ms -Geometric: 1663.110986923899 ms - -How much faster? 
-Total: 56156 ms -Average: 3532 ms -Harmonic: 764 ms -Geometric: 1949 ms - -NanoPowGl: -Total: 22831.300000041723 ms -Average: 3805.2166666736207 ms -Harmonic: 928.6432328540742 ms -Geometric: 2500.810238375608 ms -Minimum: 193 ms -Maximum: 8361 ms - -NanoPowGl: Time to calculate proof-of-work for a send block 512 times -Total: 680948 ms -Average: 1329.9765625 ms -Harmonic: 749.6552658409396 ms - - - -CHROMIUM with more accurate timings - -NanoPowGpu: Time to calculate proof-of-work for a send block 8192 times -Total: 2934170.3000008166 ms -Average: 358.17508544931843 ms -Harmonic: 218.11823673331645 ms -Minimum: 76.2000000178814 ms -Maximum: 2999.9000000059605 ms - - - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -Total: 187428.40000000596 ms -Average: 366.07109375001164 ms -Harmonic: 220.70399520519166 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -Total: 187827.7999998629 ms -Average: 366.85117187473224 ms -Harmonic: 223.9897252426498 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining entire first G round) -Total: 156981.3999993205 ms -Average: 306.60429687367287 ms -Harmonic: 128.74904701127866 ms -Minimum: 21.700000047683716 ms -Maximum: 1981.199999988079 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining entire first G round) -Total: 162225.30000036955 ms -Average: 316.8462890632218 ms -Harmonic: 130.50451885939313 ms -Geometric: 211.25671228925867 ms -Minimum: 21.600000023841858 ms -Maximum: 2267.600000023842 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining 3 rounds of G mixing) -Total: 155547.09999996424 ms -Average: 303.80292968743015 ms -Harmonic: 118.19131857240315 ms -Geometric: 196.77234360098842 ms -Minimum: 19.5 ms -Maximum: 2140.2000000476837 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining 5 rounds of G mixing) -Total: 
165145.19999998808 ms -Average: 322.5492187499767 ms -Harmonic: 126.40822610460997 ms -Geometric: 205.28427810986508 ms -Minimum: 20.099999964237213 ms -Maximum: 1850.5 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining 5 rounds of G mixing and replacing if with select in original G function) -Total: 135665.40000021458 ms -Average: 264.9714843754191 ms -Harmonic: 118.80915172412905 ms -Geometric: 181.19191881133972 ms -Minimum: 19.599999964237213 ms -Maximum: 1908.5 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining 9 rounds of G mixing and replacing if with select in original G function) -Total: 147481.09999907017 ms -Average: 288.0490234356839 ms -Harmonic: 122.25335314983455 ms -Geometric: 192.75325397221323 ms -Minimum: 22.19999998807907 ms -Maximum: 1762.800000011921 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining all rounds of G mixing) -Total: 165041.20000058413 ms -Average: 322.34609375114087 ms -Harmonic: 121.30652845909019 ms -Geometric: 202.80092012876665 ms -Minimum: 21.69999998807907 ms -Maximum: 2303 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 512 times -(after inlining all rounds of G mixing and all if statements replaced with select function) -Total: 134865.20000064373 ms -Average: 263.4085937512573 ms -Harmonic: 108.28019222737286 ms -Geometric: 171.8797089689105 ms -Minimum: 20.80000001192093 ms -Maximum: 2093.199999988079 ms - -NanoPow (WebGPU) 0xff -{ - "count": 512, - "total": 149335.80000003055, - "min": 9.400000000372529, - "max": 1503.300000000745, - "arithmetic": 291.67148437505966, - "truncated": 222.58417968753201, - "harmonic": 106.71381226989509, - "geometric": 186.92638314142255 -} - -NanoPow (WebGPU) 0xfff -{ - "count": 512, - "total": 164261.39999999292, - "min": 79.5, - "max": 1424.7000000011176, - "arithmetic": 320.8230468749862, - "truncated": 263.8744140625058, - "harmonic": 
209.95457211379528, - "geometric": 256.8968599479061 -} - -NanoPow (WebGPU) 0x800 -{ - "count": 512, - "total": 125924.59999999404, - "min": 23, - "max": 1799.1000000014901, - "arithmetic": 245.94648437498836, - "truncated": 198.84531250000146, - "harmonic": 115.44432001873471, - "geometric": 171.54249948295475 -} - -NanoPow (WebGPU) 0x400 -{ - "count": 512, - "total": 132129.60000000335, - "min": 11.799999998882413, - "max": 2051.9000000003725, - "arithmetic": 258.06562500000655, - "truncated": 201.65429687500364, - "harmonic": 86.37881890351905, - "geometric": 156.54611901649818 -} - -NanoPow (WebGPU) 0x999 -{ - "count": 512, - "total": 132693.0000000093, - "min": 32.30000000074506, - "max": 2258.800000000745, - "arithmetic": 259.1660156250182, - "truncated": 208.9763671874971, - "harmonic": 133.4766737582568, - "geometric": 185.94074203825846 -} - -NanoPow (WebGPU) 0x400 -{ - "count": 512, - "total": 136912.30000001006, - "min": 8.900000000372529, - "max": 1369.9000000003725, - "arithmetic": 267.40683593751965, - "truncated": 196.9111328124891, - "harmonic": 96.43707569252571, - "geometric": 166.5867151432514 -} - -NanoPow (WebGPU) 3070 (0x400) -{ - "count": 512, - "total": 138190.59999987483, - "min": 9.799999997019768, - "max": 1512.5999999940395, - "arithmetic": 269.9035156247555, - "truncated": 201.92070312495343, - "harmonic": 90.32084803883801, - "geometric": 166.60778722043577 -} - -NanoPow (WebGPU) 3070 (0x400) -{ - "count": 512, - "total": 139817.9999998659, - "rate": 3.6619033314772853, - "min": 11.599999994039536, - "max": 1748.4000000059605, - "arithmetic": 273.08203124973807, - "truncated": 202.18183593745925, - "harmonic": 92.89356260079113, - "geometric": 168.54025739379932 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 256) -(after converting `v` scalars to vectors) -{ - "count": 512, - "total": 136717.80000030994, - "rate": 3.74494030769102, - "min": 11.700000002980232, - "max": 2057.2999999970198, - "arithmetic": 267.02695312560536, - 
"truncated": 213.24863281298894, - "harmonic": 94.06525233622521, - "geometric": 162.9622640163259 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 127778.10000002384, - "rate": 4.006946417264809, - "min": 15, - "max": 1386.9000000059605, - "arithmetic": 249.56660156254657, - "truncated": 176.96152343729045, - "harmonic": 88.48986919127314, - "geometric": 152.85173378506695 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 126437.00000026822, - "rate": 4.04944755094564, - "min": 8.299999982118607, - "max": 1592.5999999940395, - "arithmetic": 246.94726562552387, - "truncated": 197.15312500030268, - "harmonic": 83.44136709616109, - "geometric": 151.69548923418716 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 32) -{ - "count": 512, - "total": 130610.90000012517, - "rate": 3.920040364161868, - "min": 16.899999976158142, - "max": 1475.5, - "arithmetic": 255.09941406274447, - "truncated": 201.2101562502212, - "harmonic": 108.50998153048026, - "geometric": 169.9536026991119 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 133471.2999998033, - "rate": 3.836030667272699, - "min": 9.400000005960464, - "max": 1703.5, - "arithmetic": 260.68613281211583, - "truncated": 205.00488281232538, - "harmonic": 88.65601312209833, - "geometric": 160.81454554342605 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) debug mode -{ - "count": 512, - "total": 138230.30000001192, - "rate": 3.70396360276984, - "min": 11.199999988079071, - "max": 1709.9000000059605, - "arithmetic": 269.9810546875233, - "truncated": 223.11933593766298, - "harmonic": 86.31741549131624, - "geometric": 164.2553862861393 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) debug mode -{ - "count": 512, - "total": 137174.0999996364, - "rate": 3.732483027053628, - "min": 9.300000011920929, - "max": 1598.0999999940395, - "arithmetic": 267.91816406178987, - "truncated": 
189.36367187427823, - "harmonic": 88.46031367791892, - "geometric": 157.678025720888 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) debug mode -{ - "count": 512, - "total": 140824.9999999702, - "rate": 3.63571808982857, - "min": 12.100000023841858, - "max": 1640.300000011921, - "arithmetic": 275.0488281249418, - "truncated": 209.06249999988358, - "harmonic": 97.42050559825007, - "geometric": 174.1405646344008 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) debug mode -{ - "count": 512, - "total": 134499.49999976158, - "rate": 3.806705601142812, - "min": 24.200000017881393, - "max": 1324.5999999940395, - "arithmetic": 262.69433593703434, - "truncated": 206.15488281205762, - "harmonic": 117.14110375247057, - "geometric": 178.66910197648477 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 135615.80000019073, - "rate": 3.7753713062879095, - "min": 11.300000011920929, - "max": 1967.5, - "arithmetic": 264.87460937537253, - "truncated": 214.91972656268626, - "harmonic": 95.29029745129836, - "geometric": 166.91867186739316 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 129097.29999995232, - "rate": 3.9660008381289855, - "min": 9.099999994039536, - "max": 1335.9000000059605, - "arithmetic": 252.14316406240687, - "truncated": 200.86796874983702, - "harmonic": 87.70052168819247, - "geometric": 159.12504898549835 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 129204.69999992847, - "rate": 3.9627041431177306, - "min": 14.900000005960464, - "max": 1968.5, - "arithmetic": 252.3529296873603, - "truncated": 205.91660156229045, - "harmonic": 97.46470054285273, - "geometric": 162.08582278945588 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 512, - "total": 140579.30000019073, - "rate": 3.6420724815054943, - "min": 11.200000017881393, - "max": 1399.0999999940395, - "arithmetic": 274.56894531287253, - "truncated": 
213.73515625024447, - "harmonic": 104.79799509895179, - "geometric": 179.31860807870936 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 256) -{ - "count": 512, - "total": 138712.99999949336, - "rate": 3.691074376603996, - "min": 24.799999982118607, - "max": 1613.0999999940395, - "arithmetic": 270.92382812401047, - "truncated": 207.68144531175494, - "harmonic": 117.38847021261739, - "geometric": 179.3126486381509 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 64, - "total": 13121.699999928474, - "rate": 4.877416798154878, - "min": 9.700000017881393, - "max": 1030.5, - "arithmetic": 205.0265624988824, - "truncated": 142.53124999860302, - "harmonic": 63.87965522037639, - "geometric": 117.9588307327579 -} - -NanoPow (WebGPU) 3070 (dispatch 0x800 workgroup 64) -{ - "count": 64, - "total": 16865.300000220537, - "rate": 3.7947738847908496, - "min": 18.099999994039536, - "max": 1783.199999988079, - "arithmetic": 263.5203125034459, - "truncated": 240.13593750307336, - "harmonic": 113.00040689556182, - "geometric": 184.71274112138641 -} - -NanoPow (WebGPU) 3070 (dispatch 0x400 workgroup 64) -{ - "count": 64, - "total": 19725, - "rate": 3.244613434727503, - "min": 9.400000005960464, - "max": 1713.7999999821186, - "arithmetic": 308.203125, - "truncated": 248.60625000065193, - "harmonic": 80.57924794605395, - "geometric": 182.49585952465603 -} - -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 64) -{ - "count": 64, - "total": 16167.400000214577, - "rate": 3.958583321941102, - "min": 9.599999994039536, - "max": 1210.800000011921, - "arithmetic": 252.61562500335276, - "truncated": 208.5312500023283, - "harmonic": 79.1288243036302, - "geometric": 160.41093372644522 -} -NanoPow (WebGPU) 3070 (dispatch 0x1000 workgroup 256) -{ - "count": 64, - "total": 16317.899999916553, - "rate": 3.9220733060214417, - "min": 24.400000005960464, - "max": 858.2999999821186, - "arithmetic": 254.96718749869615, - "truncated": 208.06562499934807, - "harmonic": 
90.51419688879719, - "geometric": 152.3002389506704 -} - - -NanoPowGpu: Time to calculate proof-of-work for a send block 32 times -Total: 8909.500000029802 ms -Average: 278.4218750009313 ms -Harmonic: 191.49100480215873 ms -Geometric: 232.13670548729021 ms -Minimum: 76.69999998807907 ms -Maximum: 641.5 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 32 times -Total: 11805.200000077486 ms -Average: 368.91250000242144 ms -Harmonic: 131.36379466491744 ms -Geometric: 228.69384924435158 ms -Minimum: 21.900000005960464 ms -Maximum: 1479.5 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 32 times -(after inlining three G calls) -Total: 11208.399999916553 ms -Average: 350.2624999973923 ms -Harmonic: 115.78621253028925 ms -Geometric: 210.41080264689026 ms -Minimum: 25 ms -Maximum: 1249.199999988079 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 32 times -(after inlining entire first G round) -Total: 9778.899999797344 ms -Average: 305.590624993667 ms -Harmonic: 120.65186396913916 ms -Geometric: 193.85674573632113 ms -Minimum: 23.69999998807907 ms -Maximum: 1752.199999988079 ms - -NanoPowGpu: Time to calculate proof-of-work for a send block 32 times -(after inlining 3 rounds of G mixing) -Total: 10425.399999856949 ms -Average: 325.79374999552965 ms -Harmonic: 150.5729403997282 ms -Geometric: 231.43806657572657 ms -Minimum: 31.900000035762787 ms -Maximum: 954.9000000357628 ms - - -NanoPow (WebGPU) iPhone 0xff -{ - "count": 32, - "total": 161323, - "min": 130, - "max": 22190, - "arithmetic": 5041.3438, - "truncated": 3780.2813, - "harmonic": 1252.8660, - "geometric": 2906.9620 -} - -NanoPow (WebGPU) iPad Mini 2 0x400 -{ - "count": 32, - "total": 226661, - "min": 169, - "max": 31964, - "arithmetic": 7083.1563, - "truncated": 5208.2813, - "harmonic": 1722.7691, - "geometric": 4127.4107 -} - diff --git a/main.min.js b/main.min.js deleted file mode 100644 index f05aac9..0000000 --- a/main.min.js +++ /dev/null @@ -1,759 +0,0 
@@ -// src/shaders/compute.wgsl -var compute_default = "struct UBO{blockhash:array,2>,random:vec2,threshold:u32};@group(0)@binding(0)var ubo:UBO;struct WORK{nonce:vec2,found:atomic};@group(0)@binding(1)varwork:WORK;var found:bool;const BLAKE2B_IV32_0:vec2=vec2(0xF2BDC900u,0x6A09E667u);const ROTATE_1=vec2(1u,1u);const ROTATE_8=vec2(8u,8u);const ROTATE_16=vec2(16u,16u);const ROTATE_24=vec2(24u,24u);const ROTATE_31=vec2(31u,31u);@compute @workgroup_size(64)fn search(@builtin(global_invocation_id)global_id:vec3,@builtin(local_invocation_id)local_id:vec3){found=(local_id.x==0u&&atomicLoad(&work.found)!=0u);workgroupBarrier();if(found){return;}main(global_id);}@compute @workgroup_size(1)fn validate(@builtin(global_invocation_id)global_id:vec3){main(global_id);}fn main(id:vec3){let threshold:u32=ubo.threshold;let m0:vec2=ubo.random ^ id.xy;let m1:vec2=ubo.blockhash[0u].xy;let m2:vec2=ubo.blockhash[0u].zw;let m3:vec2=ubo.blockhash[1u].xy;let m4:vec2=ubo.blockhash[1u].zw;var v0:vec2=BLAKE2B_IV32_0;var v1:vec2=vec2(0x84CAA73Bu,0xBB67AE85u);var v2:vec2=vec2(0xFE94F82Bu,0x3C6EF372u);var v3:vec2=vec2(0x5F1D36F1u,0xA54FF53Au);var v4:vec2=vec2(0xADE682D1u,0x510E527Fu);var v5:vec2=vec2(0x2B3E6C1Fu,0x9B05688Cu);var v6:vec2=vec2(0xFB41BD6Bu,0x1F83D9ABu);var v7:vec2=vec2(0x137E2179u,0x5BE0CD19u);var v8:vec2=vec2(0xF3BCC908u,0x6A09E667u);var v9:vec2=vec2(0x84CAA73Bu,0xBB67AE85u);var v10:vec2=vec2(0xFE94F82Bu,0x3C6EF372u);var v11:vec2=vec2(0x5F1D36F1u,0xA54FF53Au);var v12:vec2=vec2(0xADE682F9u,0x510E527Fu);var v13:vec2=vec2(0x2B3E6C1Fu,0x9B05688Cu);var v14:vec2=vec2(0x04BE4294u,0xE07C2654u);var v15:vec2=vec2(0x137E2179u,0x5BE0CD19u);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ 
v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ 
v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ 
v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ 
v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_31);v1=v1+v5+vec2(0u,u32(v1.x+v5.x>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_31);v3=v3+v7+vec2(0u,u32(v3.x+v7.x>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_16)|((v15 ^ v0).yx<>ROTATE_31);v1=v1+v6+vec2(0u,u32(v1.x+v6.x>ROTATE_24)|((v6 ^ 
v11).yx<>ROTATE_16)|((v12 ^ v1).yx<>ROTATE_31);v2=v2+v7+vec2(0u,u32(v2.x+v7.x>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<>ROTATE_31);v3=v3+v4+vec2(0u,u32(v3.x+v4.x>ROTATE_24)|((v4 ^ v9).yx<>ROTATE_16)|((v14 ^ v3).yx<>ROTATE_31);v0=v0+v4+vec2(0u,u32(v0.x+v4.x>ROTATE_24)|((v4 ^ v8).yx<>ROTATE_16)|((v12 ^ v0).yx<>ROTATE_24)|((v5 ^ v9).yx<>ROTATE_16)|((v13 ^ v1).yx<>ROTATE_31);v2=v2+v6+vec2(0u,u32(v2.x+v6.x>ROTATE_24)|((v6 ^ v10).yx<>ROTATE_16)|((v14 ^ v2).yx<>ROTATE_24)|((v7 ^ v11).yx<>ROTATE_16)|((v15 ^ v3).yx<>ROTATE_31);v0=v0+v5+vec2(0u,u32(v0.x+v5.x>ROTATE_24)|((v5 ^ v10).yx<>ROTATE_24)|((v7 ^ v8).yx<>ROTATE_16)|((v13 ^ v2).yx<threshold){let wasFound:u32=atomicExchange(&work.found,1u);if(wasFound==0u){work.nonce=m0;}}return;}"; - -// src/shaders/gl-fragment.ts -var NanoPowGlFragmentShader = `#version 300 es -#pragma vscode_glsllint_stage: frag -precision highp float; - -in vec2 uv_pos; -out uvec4 nonce; - -// blockhash - array of precalculated block hash components -// threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else -// workload - Defines canvas size -layout(std140) uniform UBO { - uint blockhash[8]; - uint threshold; - float workload; -}; - -// Random work values -layout(std140) uniform WORK { - uvec2 work; -}; - -// Defined separately from uint v[32] below as the original value is required -// to calculate the second uint32 of the digest for threshold comparison -const uint BLAKE2B_IV32_1 = 0x6A09E667u; - -// Used during G for vector bit rotations -const uvec2 ROTATE_1 = uvec2(1u, 1u); -const uvec2 ROTATE_8 = uvec2(8u, 8u); -const uvec2 ROTATE_16 = uvec2(16u, 16u); -const uvec2 ROTATE_24 = uvec2(24u, 24u); -const uvec2 ROTATE_31 = uvec2(31u, 31u); - -// Both buffers represent 16 uint64s as 32 uint32s -// because that's what GLSL offers, just like Javascript - -// Compression buffer, intialized to 2 instances of the initialization vector -// The following values have been modified from the BLAKE2B_IV: -// OUTLEN is constant 8 bytes 
-// v[0] ^= 0x01010000u ^ uint(OUTLEN); -// INLEN is constant 40 bytes: work value (8) + block hash (32) -// v[24] ^= uint(INLEN); -// It's always the "last" compression at this INLEN -// v[28] = ~v[28]; -// v[29] = ~v[29]; -uvec2 v[16] = uvec2[16]( - uvec2(0xF2BDC900u, 0x6A09E667u), - uvec2(0x84CAA73Bu, 0xBB67AE85u), - uvec2(0xFE94F82Bu, 0x3C6EF372u), - uvec2(0x5F1D36F1u, 0xA54FF53Au), - uvec2(0xADE682D1u, 0x510E527Fu), - uvec2(0x2B3E6C1Fu, 0x9B05688Cu), - uvec2(0xFB41BD6Bu, 0x1F83D9ABu), - uvec2(0x137E2179u, 0x5BE0CD19u), - uvec2(0xF3BCC908u, 0x6A09E667u), - uvec2(0x84CAA73Bu, 0xBB67AE85u), - uvec2(0xFE94F82Bu, 0x3C6EF372u), - uvec2(0x5F1D36F1u, 0xA54FF53Au), - uvec2(0xADE682F9u, 0x510E527Fu), - uvec2(0x2B3E6C1Fu, 0x9B05688Cu), - uvec2(0x04BE4294u, 0xE07C2654u), - uvec2(0x137E2179u, 0x5BE0CD19u) -); - -// Input data buffer -uvec2 m[16]; - -// Offsets into the input data buffer for each mixing step -const uint SIGMA[192] = uint[192]( - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u, - 11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u, - 7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u, - 9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u, - 2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u, - 12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u, - 13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u, - 6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u, - 10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u, - 0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u, - 14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u -); - -// G mixing function -void G (uint a, uint b, uint c, uint d, uint x, uint y) { - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x)); - v[d] = (v[d] ^ v[a]).yx; - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ 
v[c]).yx << ROTATE_8); - v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x)); - v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x)); - v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16); - v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x)); - v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1); -} - -void main() { - // Nonce uniquely differentiated by pixel location - m[0u].x = work.x ^ uint(uv_pos.x * workload); - m[0u].y = work.y ^ uint(uv_pos.y * workload); - - // Block hash - m[1u] = uvec2(blockhash[0u], blockhash[1u]); - m[2u] = uvec2(blockhash[2u], blockhash[3u]); - m[3u] = uvec2(blockhash[4u], blockhash[5u]); - m[4u] = uvec2(blockhash[6u], blockhash[7u]); - - // twelve rounds of mixing - for(uint i = 0u; i < 12u; i = i + 1u) { - G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]); - G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]); - G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]); - G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]); - G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]); - G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]); - G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]); - G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]); - } - - // Pixel data set from work values - // Finalize digest from high bits, low bits can be safely ignored - if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) { - nonce = uvec4(1u, m[0].y, m[0].x, 1u); - } else { - discard; - } -} -`; - -// src/shaders/gl-vertex.ts -var NanoPowGlVertexShader = `#version 300 es -#pragma vscode_glsllint_stage: vert -precision highp float; -layout (location=0) in vec4 position; -layout (location=1) in vec2 uv; - -out vec2 uv_pos; - -void main() { - uv_pos = uv; - gl_Position = position; -} -`; - -// src/classes/gl.ts -var NanoPowGl = class _NanoPowGl { - static #busy = false; - /** Used to set canvas size. 
Must be a multiple of 256. */ - static #WORKLOAD = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency)); - static #gl; - static #program; - static #vertexShader; - static #fragmentShader; - static #texture; - static #framebuffer; - static #positionBuffer; - static #uvBuffer; - static #uboBuffer; - static #workBuffer; - static #query; - static #pixels; - /**Vertex Positions, 2 triangles */ - static #positions = new Float32Array([ - -1, - -1, - 0, - -1, - 1, - 0, - 1, - 1, - 0, - 1, - -1, - 0, - 1, - 1, - 0, - -1, - -1, - 0 - ]); - /** Texture Positions */ - static #uvPosArray = new Float32Array([ - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 1, - 1 - ]); - /** Compile */ - static async init() { - if (this.#busy) return; - this.#busy = true; - try { - this.#gl = new OffscreenCanvas(this.#WORKLOAD, this.#WORKLOAD).getContext("webgl2"); - if (this.#gl == null) throw new Error("WebGL 2 is required"); - this.#gl.clearColor(0, 0, 0, 1); - this.#program = this.#gl.createProgram(); - if (this.#program == null) throw new Error("Failed to create shader program"); - this.#vertexShader = this.#gl.createShader(this.#gl.VERTEX_SHADER); - if (this.#vertexShader == null) throw new Error("Failed to create vertex shader"); - this.#gl.shaderSource(this.#vertexShader, NanoPowGlVertexShader); - this.#gl.compileShader(this.#vertexShader); - if (!this.#gl.getShaderParameter(this.#vertexShader, this.#gl.COMPILE_STATUS)) - throw new Error(this.#gl.getShaderInfoLog(this.#vertexShader) ?? `Failed to compile vertex shader`); - this.#fragmentShader = this.#gl.createShader(this.#gl.FRAGMENT_SHADER); - if (this.#fragmentShader == null) throw new Error("Failed to create fragment shader"); - this.#gl.shaderSource(this.#fragmentShader, NanoPowGlFragmentShader); - this.#gl.compileShader(this.#fragmentShader); - if (!this.#gl.getShaderParameter(this.#fragmentShader, this.#gl.COMPILE_STATUS)) - throw new Error(this.#gl.getShaderInfoLog(this.#fragmentShader) ?? 
`Failed to compile fragment shader`); - this.#gl.attachShader(this.#program, this.#vertexShader); - this.#gl.attachShader(this.#program, this.#fragmentShader); - this.#gl.linkProgram(this.#program); - if (!this.#gl.getProgramParameter(this.#program, this.#gl.LINK_STATUS)) - throw new Error(this.#gl.getProgramInfoLog(this.#program) ?? `Failed to link program`); - this.#gl.useProgram(this.#program); - const triangleArray = this.#gl.createVertexArray(); - this.#gl.bindVertexArray(triangleArray); - this.#texture = this.#gl.createTexture(); - this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture); - this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null); - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST); - this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST); - this.#gl.bindTexture(this.#gl.TEXTURE_2D, null); - this.#framebuffer = this.#gl.createFramebuffer(); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0); - if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE) - throw new Error(`Failed to create framebuffer`); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); - this.#positionBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer); - this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW); - this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0); - this.#gl.enableVertexAttribArray(0); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null); - this.#uvBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer); - this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, 
this.#gl.STATIC_DRAW); - this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0); - this.#gl.enableVertexAttribArray(1); - this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null); - this.#uboBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer); - this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 144, this.#gl.DYNAMIC_DRAW); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer); - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "UBO"), 0); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#workBuffer = this.#gl.createBuffer(); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer); - this.#gl.bufferData(this.#gl.UNIFORM_BUFFER, 32, this.#gl.STREAM_DRAW); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer); - this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, "WORK"), 1); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4); - this.#query = this.#gl.createQuery(); - } catch (err) { - throw new Error(`WebGL initialization failed. 
${err}`); - } finally { - this.#busy = false; - } - } - static reset() { - _NanoPowGl.#query = null; - _NanoPowGl.#workBuffer = null; - _NanoPowGl.#uboBuffer = null; - _NanoPowGl.#uvBuffer = null; - _NanoPowGl.#positionBuffer = null; - _NanoPowGl.#framebuffer = null; - _NanoPowGl.#texture = null; - _NanoPowGl.#fragmentShader = null; - _NanoPowGl.#vertexShader = null; - _NanoPowGl.#program = null; - _NanoPowGl.#gl = null; - _NanoPowGl.#busy = false; - _NanoPowGl.init(); - } - static #logAverages(times) { - let count = times.length, sum = 0, reciprocals = 0, logarithms = 0, truncated = 0, min = 65535, max = 0, rate = 0; - times.sort(); - for (let i = 0; i < count; i++) { - sum += times[i]; - reciprocals += 1 / times[i]; - logarithms += Math.log(times[i]); - min = Math.min(min, times[i]); - max = Math.max(max, times[i]); - if (count < 3 || i > count * 0.1 && i < count * 0.9) truncated += times[i]; - } - const averages = { - "Count (frames)": count, - "Total (ms)": sum, - "Rate (f/s)": 1e3 * count * 0.8 / (truncated || sum), - "Minimum (ms)": min, - "Maximum (ms)": max, - "Arithmetic Mean (ms)": sum / count, - "Truncated Mean (ms)": truncated / count, - "Harmonic Mean (ms)": count / reciprocals, - "Geometric Mean (ms)": Math.exp(logarithms / count) - }; - console.table(averages); - } - static #draw(work) { - if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to draw and query pixels"); - if (this.#workBuffer == null) throw new Error("Work buffer is required to draw"); - this.#gl.clear(this.#gl.COLOR_BUFFER_BIT); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer); - this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work); - this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null); - this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); - 
this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE); - } - static async #checkQueryResult() { - return new Promise((resolve, reject) => { - try { - if (this.#gl == null || this.#query == null) throw new Error("WebGL 2 is required to check query results"); - if (this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT_AVAILABLE)) { - resolve(!!this.#gl.getQueryParameter(this.#query, this.#gl.QUERY_RESULT)); - } else { - requestAnimationFrame(async () => { - const result = await _NanoPowGl.#checkQueryResult(); - resolve(result); - }); - } - } catch (err) { - reject(err); - } - }); - } - /** - * Reads pixels into the work buffer, checks every 4th pixel for the 'found' - * byte, converts the subsequent 3 pixels with the nonce byte values to a hex - * string, and returns the result. - * - * @param workHex - Original nonce if provided for a validation call - * @returns Nonce as an 8-byte (16-char) hexadecimal string - */ - static #readResult(workHex) { - if (this.#gl == null) throw new Error("WebGL 2 is required to read pixels"); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer); - this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels); - this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null); - for (let i = 0; i < this.#pixels.length; i += 4) { - if (this.#pixels[i] !== 0) { - const hex = `${this.#pixels[i + 1].toString(16).padStart(8, "0")}${this.#pixels[i + 2].toString(16).padStart(8, "0")}`; - if (workHex == null || workHex == hex) return hex; - } - } - throw new Error("Query reported result but nonce value not found"); - } - /** - * Finds a nonce that satisfies the Nano proof-of-work requirements. 
- * - * @param {string} hash - Hexadecimal hash of previous block, or public key for new accounts - * @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation - */ - static async search(hash, options) { - if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); - if (this.#gl == null) throw new Error("WebGL 2 is required"); - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); - if (this.#busy) { - return new Promise((resolve) => { - setTimeout(async () => { - const result = this.search(hash, options); - resolve(result); - }, 100); - }); - } - this.#busy = true; - const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; - const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 8 : options.effort; - const debug = !!options?.debug; - if (this.#WORKLOAD !== 256 * effort) { - this.#WORKLOAD = 256 * effort; - this.reset(); - } - const uboView = new DataView(new ArrayBuffer(144)); - for (let i = 0; i < 64; i += 8) { - const uint32 = hash.slice(i, i + 8); - uboView.setUint32(i * 2, parseInt(uint32, 16)); - } - uboView.setUint32(128, threshold, true); - uboView.setFloat32(132, 256 * effort, true); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer); - _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null); - let times = []; - let start = performance.now(); - let nonce = null; - const seed = new Uint8Array(8); - while (nonce == null) { - start = performance.now(); - crypto.getRandomValues(seed); - this.#draw(seed); - const found = await this.#checkQueryResult(); - times.push(performance.now() - start); - if (found) { - nonce = this.#readResult(); - } - } - this.#busy = false; - if (debug) this.#logAverages(times); - return nonce; - } - /** - * Validates that a nonce 
satisfies Nano proof-of-work requirements. - * - * @param {string} work - Hexadecimal proof-of-work value to validate - * @param {string} hash - Hexadecimal hash of previous block, or public key for new accounts - * @param {number} [threshold=0xfffffff8] - Difficulty of proof-of-work calculation - */ - static async validate(work, hash, options) { - if (_NanoPowGl.#gl == null) throw new Error("WebGL 2 is required"); - if (this.#gl == null) throw new Error("WebGL 2 is required"); - if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new Error(`Invalid work ${work}`); - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new Error(`Invalid hash ${hash}`); - if (this.#busy) { - return new Promise((resolve) => { - setTimeout(async () => { - const result = this.validate(work, hash, options); - resolve(result); - }, 100); - }); - } - this.#busy = true; - const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 4294967288 : options.threshold; - const debug = !!options?.debug; - if (this.#WORKLOAD !== 1) { - this.#WORKLOAD = 1; - this.reset(); - } - const uboView = new DataView(new ArrayBuffer(144)); - for (let i = 0; i < 64; i += 8) { - const uint32 = hash.slice(i, i + 8); - uboView.setUint32(i * 2, parseInt(uint32, 16)); - } - uboView.setUint32(128, threshold, true); - uboView.setFloat32(132, _NanoPowGl.#WORKLOAD - 1, true); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, _NanoPowGl.#uboBuffer); - _NanoPowGl.#gl.bufferSubData(_NanoPowGl.#gl.UNIFORM_BUFFER, 0, uboView); - _NanoPowGl.#gl.bindBuffer(_NanoPowGl.#gl.UNIFORM_BUFFER, null); - let nonce = null; - const data = new DataView(new ArrayBuffer(8)); - data.setBigUint64(0, BigInt(`0x${work}`), true); - const seed = new Uint8Array(data.buffer); - this.#draw(seed); - let found = await this.#checkQueryResult(); - if (found) { - try { - nonce = this.#readResult(work); - } catch (err) { - found = false; - } - } - this.#busy = false; - if (found && nonce !== work) throw new 
Error(`Nonce found but does not match work`); - return found; - } -}; - -// src/classes/gpu.ts -var NanoPowGpu = class _NanoPowGpu { - // Initialize WebGPU - static #busy = false; - static #device = null; - static #uboBuffer; - static #gpuBuffer; - static #cpuBuffer; - static #bindGroupLayout; - static #searchPipeline; - static #validatePipeline; - // Initialize WebGPU - static async init() { - if (this.#busy) return; - this.#busy = true; - try { - if (navigator.gpu == null) throw new Error("WebGPU is not supported in this browser."); - const adapter = await navigator.gpu.requestAdapter(); - if (adapter == null) throw new Error("WebGPU adapter refused by browser."); - const device = await adapter.requestDevice(); - if (!(device instanceof GPUDevice)) throw new Error("WebGPU device failed to load."); - device.lost.then(this.reset); - this.#device = device; - this.setup(); - } catch (err) { - throw new Error(`WebGPU initialization failed. ${err}`); - } finally { - this.#busy = false; - } - } - static setup() { - if (this.#device == null) throw new Error(`WebGPU device failed to load.`); - this.#uboBuffer = this.#device.createBuffer({ - size: 48, - usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST - }); - this.#gpuBuffer = this.#device.createBuffer({ - size: 16, - usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC - }); - this.#cpuBuffer = this.#device.createBuffer({ - size: 16, - usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ - }); - this.#bindGroupLayout = this.#device.createBindGroupLayout({ - entries: [ - { - binding: 0, - visibility: GPUShaderStage.COMPUTE, - buffer: { type: "uniform" } - }, - { - binding: 1, - visibility: GPUShaderStage.COMPUTE, - buffer: { type: "storage" } - } - ] - }); - const shaderModule = this.#device.createShaderModule({ - code: compute_default - }); - this.#searchPipeline = this.#device.createComputePipeline({ - layout: this.#device.createPipelineLayout({ - bindGroupLayouts: 
[this.#bindGroupLayout] - }), - compute: { - entryPoint: "search", - module: shaderModule - } - }); - this.#validatePipeline = this.#device.createComputePipeline({ - layout: this.#device.createPipelineLayout({ - bindGroupLayouts: [this.#bindGroupLayout] - }), - compute: { - entryPoint: "validate", - module: shaderModule - } - }); - } - static reset() { - console.warn(`GPU device lost. Reinitializing...`); - _NanoPowGpu.#cpuBuffer?.destroy(); - _NanoPowGpu.#gpuBuffer?.destroy(); - _NanoPowGpu.#uboBuffer?.destroy(); - _NanoPowGpu.#busy = false; - _NanoPowGpu.init(); - } - static #logAverages(times) { - let count = times.length, truncatedCount = 0, truncated = 0, sum = 0, reciprocals = 0, logarithms = 0, min = Number.MAX_SAFE_INTEGER, max = 0, median = 0, rate = 0; - times.sort(); - for (let i = 0; i < count; i++) { - sum += times[i]; - reciprocals += 1 / times[i]; - logarithms += Math.log(times[i]); - min = Math.min(min, times[i]); - max = Math.max(max, times[i]); - if (i === Math.ceil(count / 2)) { - median = times[i]; - } - if (count < 3 || i > 0.1 * count && i < 0.9 * (count - 1)) { - truncated += times[i]; - truncatedCount++; - } - } - const averages = { - "Count (dispatches)": count, - "Total (ms)": sum, - "Rate (d/s)": 1e3 * truncatedCount / (truncated || sum), - "Minimum (ms)": min, - "Maximum (ms)": max, - "Median (ms)": median, - "Arithmetic Mean (ms)": sum / count, - "Truncated Mean (ms)": truncated / truncatedCount, - "Harmonic Mean (ms)": count / reciprocals, - "Geometric Mean (ms)": Math.exp(logarithms / count) - }; - console.table(averages); - } - static async #dispatch(pipeline, seed, hash, threshold, passes) { - if (this.#device == null) throw new Error(`WebGPU device failed to load.`); - const uboView = new DataView(new ArrayBuffer(48)); - for (let i = 0; i < 64; i += 16) { - const u64 = hash.slice(i, i + 16); - uboView.setBigUint64(i / 2, BigInt(`0x${u64}`)); - } - uboView.setBigUint64(32, seed, true); - uboView.setUint32(40, threshold, true); - 
this.#device.queue.writeBuffer(this.#uboBuffer, 0, uboView); - this.#device.queue.writeBuffer(this.#gpuBuffer, 0, new Uint32Array([0, 0, 0])); - const bindGroup = this.#device.createBindGroup({ - layout: this.#bindGroupLayout, - entries: [ - { - binding: 0, - resource: { - buffer: this.#uboBuffer - } - }, - { - binding: 1, - resource: { - buffer: this.#gpuBuffer - } - } - ] - }); - const commandEncoder = this.#device.createCommandEncoder(); - const passEncoder = commandEncoder.beginComputePass(); - passEncoder.setPipeline(pipeline); - passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(passes, passes); - passEncoder.end(); - commandEncoder.copyBufferToBuffer(this.#gpuBuffer, 0, this.#cpuBuffer, 0, 12); - this.#device.queue.submit([commandEncoder.finish()]); - let data = null; - try { - await this.#cpuBuffer.mapAsync(GPUMapMode.READ); - await this.#device.queue.onSubmittedWorkDone(); - data = new DataView(this.#cpuBuffer.getMappedRange().slice(0)); - this.#cpuBuffer.unmap(); - } catch (err) { - console.warn(`Error getting data from GPU. ${err}`); - return this.#dispatch(pipeline, seed, hash, threshold, passes); - } - if (data == null) throw new Error(`Failed to get data from buffer.`); - return data; - } - /** - * Finds a nonce that satisfies the Nano proof-of-work requirements. - * - * @param {string} hash - Hexadecimal hash of previous block, or public key for new accounts - * @param {NanoPowOptions} options - Used to configure search execution - */ - static async search(hash, options) { - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`); - if (this.#busy) { - return new Promise((resolve) => { - setTimeout(async () => { - const result = this.search(hash, options); - resolve(result); - }, 100); - }); - } - this.#busy = true; - const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 
4294967288 : options.threshold; - const effort = typeof options?.effort !== "number" || options.effort < 1 || options.effort > 32 ? 2048 : options.effort * 256; - const debug = !!options?.debug; - let loads = 0; - while (this.#device == null && loads < 20) { - await new Promise((resolve) => { - setTimeout(resolve, 500); - }); - } - if (this.#device == null) { - this.#busy = false; - throw new Error(`WebGPU device failed to load.`); - } - let times = []; - let start = performance.now(); - let nonce = 0n; - do { - start = performance.now(); - const random = Math.floor(Math.random() * 4294967295); - const seed = BigInt(random) << 32n | BigInt(random); - const data = await this.#dispatch(this.#searchPipeline, seed, hash, threshold, effort); - nonce = data.getBigUint64(0, true); - this.#busy = !data.getUint32(8); - times.push(performance.now() - start); - } while (this.#busy); - if (debug) this.#logAverages(times); - return nonce.toString(16).padStart(16, "0"); - } - /** - * Validates that a nonce satisfies Nano proof-of-work requirements. - * - * @param {string} work - Hexadecimal proof-of-work value to validate - * @param {string} hash - Hexadecimal hash of previous block, or public key for new accounts - * @param {NanoPowOptions} options - Options used to configure search execution - */ - static async validate(work, hash, options) { - if (!/^[A-Fa-f0-9]{16}$/.test(work)) throw new TypeError(`Invalid work ${work}`); - if (!/^[A-Fa-f0-9]{64}$/.test(hash)) throw new TypeError(`Invalid hash ${hash}`); - if (this.#busy) { - return new Promise((resolve) => { - setTimeout(async () => { - const result = this.validate(work, hash, options); - resolve(result); - }, 100); - }); - } - this.#busy = true; - const debug = !!options?.debug; - const threshold = typeof options?.threshold !== "number" || options.threshold < 0 || options.threshold > 4294967295 ? 
4294967288 : options.threshold; - let loads = 0; - while (this.#device == null && loads < 20) { - await new Promise((resolve) => { - setTimeout(resolve, 500); - }); - } - if (this.#device == null) { - this.#busy = false; - throw new Error(`WebGPU device failed to load.`); - } - const seed = BigInt(`0x${work}`); - const data = await this.#dispatch(this.#validatePipeline, seed, hash, threshold, 1); - const nonce = data.getBigUint64(0, true).toString(16).padStart(16, "0"); - const found = !!data.getUint32(8); - this.#busy = false; - if (found && work !== nonce) throw new Error(`Nonce (${nonce}) found but does not match work (${work})`); - return found; - } -}; - -// src/classes/index.ts -var isGlSupported; -var isGpuSupported = false; -try { - await NanoPowGpu.init(); - isGpuSupported = true; -} catch (err) { - console.warn(`WebGPU is not supported in this environment.`); - isGpuSupported = false; -} -try { - await NanoPowGl.init(); - isGlSupported = true; -} catch (err) { - console.error(err); - console.warn(`WebGL is not supported in this environment.`); - isGlSupported = false; -} -var NanoPow = isGpuSupported ? NanoPowGpu : isGlSupported ? NanoPowGl : null; - -// src/main.ts -var main_default = NanoPow; -export { - NanoPow, - NanoPowGl, - NanoPowGpu, - main_default as default -}; diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index f2b8090..bbaafca 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: 2025 Chris Duncan // SPDX-License-Identifier: GPL-3.0-or-later +/** +* Input buffers +*/ struct UBO { blockhash: array, 2>, random: vec2, @@ -8,18 +11,15 @@ struct UBO { }; @group(0) @binding(0) var ubo: UBO; +/** +* Output buffers +*/ struct WORK { nonce: vec2, found: atomic }; @group(0) @binding(1) var work: WORK; -/** -* Shared flag to prevent execution for all workgroup threads based on the -* atomicLoad() result of a single member thread. 
-*/ -var found: bool; - /** * Defined separately from `v0` because the original value is required to * calculate the digest and compare it to the threshold. @@ -29,15 +29,26 @@ const BLAKE2B_IV32_0: vec2 = vec2(0xF2BDC900u, 0x6A09E667u); /** * Used to rotate bits by a fixed amount during G mixing. */ -const ROTATE_1 = vec2(1u, 1u); -const ROTATE_8 = vec2(8u, 8u); -const ROTATE_16 = vec2(16u, 16u); -const ROTATE_24 = vec2(24u, 24u); -const ROTATE_31 = vec2(31u, 31u); +const VEC2_0 = vec2(0u); +const VEC4_0 = vec4(0u); +const ROTATE_1 = vec4(1u); +const ROTATE_8 = vec4(8u); +const ROTATE_16 = vec4(16u); +const ROTATE_24 = vec4(24u); +const ROTATE_31 = vec4(31u); + +/** +* Shared flag to prevent execution for all workgroup threads based on the +* atomicLoad() result of a single member thread. +*/ +var found: bool; /** * Search compute function -* Calls main with a workgroup size of 64 which has been tested as optimal +* Calls main with a workgroup size of 64 which is generally considered optimal +* due to how warps and wavefronts are executed on modern GPUs. The entire +* workgroup exits immediately if a nonce was already found by a previous +* workgroup. */ @compute @workgroup_size(64) fn search(@builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3) { @@ -67,8 +78,6 @@ fn validate(@builtin(global_invocation_id) global_id: vec3) { * performance, but the variable name still contains the original index digit. 
*/ fn main(id: vec3) { - let threshold: u32 = ubo.threshold; - /** * Initialize (nonce||blockhash) concatenation */ @@ -97,28 +106,24 @@ fn main(id: vec3) { * It is always the "last" compression at this INLEN * v14 = ~v14; */ - var v0: vec2 = BLAKE2B_IV32_0; - var v1: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u); - var v2: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u); - var v3: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au); - var v4: vec2 = vec2(0xADE682D1u, 0x510E527Fu); - var v5: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu); - var v6: vec2 = vec2(0xFB41BD6Bu, 0x1F83D9ABu); - var v7: vec2 = vec2(0x137E2179u, 0x5BE0CD19u); - var v8: vec2 = vec2(0xF3BCC908u, 0x6A09E667u); - var v9: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u); - var v10: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u); - var v11: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au); - var v12: vec2 = vec2(0xADE682F9u, 0x510E527Fu); - var v13: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu); - var v14: vec2 = vec2(0x04BE4294u, 0xE07C2654u); - var v15: vec2 = vec2(0x137E2179u, 0x5BE0CD19u); + var v01: vec4 = vec4(BLAKE2B_IV32_0, 0x84CAA73Bu, 0xBB67AE85u); + var v23: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); + var v45: vec4 = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); + var v67: vec4 = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u); + var v89: vec4 = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u); + var v1011: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); + var v1213: vec4 = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); + var v1415: vec4 = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u); /** - * Twelve rounds of G mixing as part of BLAKE2b compression step. Each round is - * divided into eight subprocesses. Each subprocesses applies transformations - * to `m` and `v` variables based on a defined set of index inputs. The - * algorithm for each subprocess is defined as follows: + * Twelve rounds of G mixing as part of BLAKE2b compression step. 
Normally, + * each round is divided into eight subprocesses; NanoPow compresses these + * operations into four subprocesses by executing sequential pairs + * simultaneously, inspired by https://github.com/minio/blake2b-simd + * + * Each subprocess applies transformations to `m` and `v` variables based on + * a defined set of index inputs. The algorithm for each subprocess is defined + * as follows: * * r is the current round * i is the current subprocess within that round @@ -140,112 +145,101 @@ fn main(id: vec3) { * Each sum step has an extra carry addition. Note that the m[sigma] sum is * skipped if m[sigma] is zero since it effectively does nothing. */ - + var a: vec4; + var b: vec4; + var c: vec4; + var d: vec4; + var x: vec4; + var y: vec4; /**************************************************************************** * ROUND(0) * ****************************************************************************/ /** * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 +
vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + x = vec4(m0, m2); + y = vec4(m1, m3); - /** - * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(m4, VEC2_0); 
+ // y = VEC4_0; - /** - * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 
+ vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + // y = VEC4_0; + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 
0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -256,104 +250,89 @@ fn main(id: vec3) { /** * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + x = vec4(VEC2_0, m4); + // y = VEC4_0; - /** - * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) 
| ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 
^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(m1, m0); + y = vec4(VEC2_0, m2); - /** - * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) 
>> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + y = vec4(VEC2_0, m3); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); 
+ v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -365,104 +344,89 @@ fn main(id: vec3) { /** * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = VEC4_0; + y = vec4(VEC2_0, m0); - /** - * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << 
ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + // x = VEC4_0; + y = vec4(m2, VEC2_0); - /** - * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + 
v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(VEC2_0, m3); + // y = VEC4_0; - /** - * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, 
u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + y = vec4(m1, m4); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> 
ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -474,104 +438,89 @@ fn main(id: vec3) { /** * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = vec4(VEC2_0, m3); // still assigned this value from last assignment at r=2 i=4/5 + y = vec4(VEC2_0, m1); - /** - * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ 
v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += x + 
vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(m2, VEC2_0); + // y = VEC4_0; - /** - * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x 
+ vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(m4, VEC2_0); + y = vec4(m0, VEC2_0); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < 
v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -583,104 +532,89 @@ fn main(id: vec3) { /** * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = VEC4_0; + // y = vec4(m0, VEC2_0); // still assigned this value from last assignment at r=3 i=6/7 - /** - * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < 
v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(m2, VEC2_0); + y = vec4(m4, VEC2_0); - /** - * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < 
v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + y = vec4(m1, VEC2_0); - /** - * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z 
+ d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(VEC2_0, m3); + // y = VEC4_0; + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = 
vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -692,104 +626,89 @@ fn main(id: vec3) { /** * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + x = vec4(m2, VEC2_0); + // y = VEC4_0; - /** - * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z 
< v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(m0, VEC2_0); + y = vec4(VEC2_0, m3); - /** - * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m4 + vec2(0u, u32(v0.x + m4.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | 
((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(m4, VEC2_0); + // y = VEC4_0; - /** - * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + 
vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(VEC2_0, m1); + // y = VEC4_0; + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -801,104 +720,89 @@ fn main(id: vec3) { /** * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = 
v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = vec4(VEC2_0, m1); // still assigned this value from last assignment at r=5 i=6/7 + // y = VEC4_0; - /** - * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 
+= y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(VEC2_0, m4); + // y = VEC4_0; - /** - * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, 
u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(m0, VEC2_0); + // y = vec4(VEC2_0, m3); // still assigned this value from last assignment at r=5 i=4/5 - /** - * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y 
+ vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + y = vec4(m2, VEC2_0); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -910,104 +814,89 @@ fn main(id: vec3) { /** * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 
= (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | 
((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(VEC2_0, m3); + y = vec4(m1, VEC2_0); - /** - * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += 
v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + y = vec4(m0, m4); - /** - * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) 
<< ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m2 + vec2(0u, u32(v3.x + m2.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(VEC2_0, m2); + // y = VEC4_0; + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -1019,104 +908,89 @@ fn main(id: vec3) { /** * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - 
v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> 
ROTATE_31); /** + * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(VEC2_0, m0); + y = vec4(m3, VEC2_0); - /** - * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=8, i=4, 
a=v[0], b=v[5], c=v[10], d=v[15] * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + y = vec4(m2, VEC2_0); - /** - * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = 
vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(m1, VEC2_0); + y = vec4(m4, VEC2_0); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -1128,104 +1002,89 @@ fn main(id: vec3) { /** * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < 
v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + // x = VEC4_0; + y = vec4(m2, m4); - /** - * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 
+ v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(VEC2_0, m1); + // y = VEC4_0; - /** - * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + 
vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 
+ vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + x = vec4(m3, VEC2_0); + y = vec4(VEC2_0, m0); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -1237,104 +1096,89 @@ fn main(id: vec3) { /** * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); - - /** * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + 
v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); - v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + x = vec4(m0, m2); + y = vec4(m1, m3); - /** - * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + 
v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); + x = vec4(m4, VEC2_0); + // y = VEC4_0; - /** - * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v15 = ((v15 ^ v0) >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = (v12 ^ v1).yx; - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) >> ROTATE_24) | ((v6 ^ 
v11).yx << ROTATE_8); - v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); - v12 = ((v12 ^ v1) >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); - v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); - v6 = ((v6 ^ v11) << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); - v14 = (v14 ^ v3).yx; - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); - v3 = v3 + v4 + vec2(0u, 
u32(v3.x + v4.x < v3.x)); - v14 = ((v14 ^ v3) >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); - v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); - v4 = ((v4 ^ v9) << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + // y = VEC4_0; + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + v67 = vec4(v67.xy, b.xy); + v45 = vec4(b.zw, v45.zw); + v1213 = vec4(v1213.xy, d.xy); + v1415 = vec4(d.zw, v1415.zw); @@ -1346,87 +1190,89 @@ fn main(id: vec3) { /** * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12] - */ - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = (v12 ^ v0).yx; - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - v4 = ((v4 ^ v8) >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); - v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); - v12 = ((v12 ^ v0) >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); - v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); - // skip since it does not affect the final values of `v0` and `v8` - - /** * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); - v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); - v13 = (v13 ^ v1).yx; - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); - v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); 
- v13 = ((v13 ^ v1) >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); - v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); - v5 = ((v5 ^ v9) << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); + x = vec4(VEC2_0, m4); + // y = VEC4_0; - /** - * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14] - */ - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = (v14 ^ v2).yx; - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - v6 = ((v6 ^ v10) >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); - v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); - v14 = ((v14 ^ v2) >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); - v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); - // skip since it does not affect the final values of `v0` and `v8` + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + v1213 = (v1213 ^ v01).yxwz; + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); + v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); + // v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + v1213 = ((v1213 ^ v01) >> ROTATE_16) | ((v1213 ^ v01).yxwz << ROTATE_16); + v89 += v1213 + vec4(0u, u32(v89.x + v1213.x < v89.x), 0u, u32(v89.z + v1213.z < v89.z)); + v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); /** + * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14] * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15] */ - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = (v15 ^ v3).yx; - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); - v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); - v15 = ((v15 ^ v3) >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); - v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); - v7 = ((v7 ^ v11) << ROTATE_1) | ((v7 ^ 
v11).yx >> ROTATE_31); + // x = VEC4_0; + // y = VEC4_0; - /** - * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15] - */ - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); - v15 = (v15 ^ v0).yx; - v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); - v5 = ((v5 ^ v10) >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); - v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); - // skip since it does not affect the final values of `v0` and `v8` - // skip since it does not affect the final values of `v0` and `v8` - // skip since it does not affect the final values of `v0` and `v8` + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + v1415 = (v1415 ^ v23).yxwz; + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) >> ROTATE_24) | ((v67 ^ v1011).yxwz << ROTATE_8); + v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); + // v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + v1415 = ((v1415 ^ v23) >> ROTATE_16) | ((v1415 ^ v23).yxwz << ROTATE_16); + v1011 += v1415 + vec4(0u, u32(v1011.x + v1415.x < v1011.x), 0u, u32(v1011.z + v1415.z < v1011.z)); + v67 = ((v67 ^ v1011) << ROTATE_1) | ((v67 ^ v1011).yxwz >> ROTATE_31); /** + * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15] * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12] */ - // skip entire step since it does not affect the final values of `v0` and `v8` + b = vec4(v45.zw, v67.xy); + d = vec4(v1415.zw, v1213.xy); + x = vec4(m1, m0); + y = vec4(VEC2_0, m2); - /** - * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13] - */ - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); - v13 = (v13 ^ v2).yx; - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - v7 = ((v7 ^ v8) >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); - v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); 
- v13 = ((v13 ^ v2) >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); - v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); - // skip since we already have the final values of `v0` and `v8` + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += x + vec4(0u, u32(v01.x + x.x < v01.x), 0u, u32(v01.z + x.z < v01.z)); + d = (d ^ v01).yxwz; + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) >> ROTATE_24) | ((b ^ v1011).yxwz << ROTATE_8); + v01 += b + vec4(0u, u32(v01.x + b.x < v01.x), 0u, u32(v01.z + b.z < v01.z)); + v01 += y + vec4(0u, u32(v01.x + y.x < v01.x), 0u, u32(v01.z + y.z < v01.z)); + d = ((d ^ v01) >> ROTATE_16) | ((d ^ v01).yxwz << ROTATE_16); + v1011 += d + vec4(0u, u32(v1011.x + d.x < v1011.x), 0u, u32(v1011.z + d.z < v1011.z)); + b = ((b ^ v1011) << ROTATE_1) | ((b ^ v1011).yxwz >> ROTATE_31); + + v45 = vec4(v45.xy, b.xy); + v67 = vec4(b.zw, v67.zw); + v1415 = vec4(v1415.xy, d.xy); + v1213 = vec4(d.zw, v1213.zw); /** + * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13] * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14] */ - // skip entire step since we already have the final values of `v0` and `v8` + b = vec4(v67.zw, v45.xy); + d = vec4(v1213.zw, v1415.xy); + // x = VEC4_0; + y = vec4(VEC2_0, m3); + + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + // v23 += x + vec4(0u, u32(v23.x + x.x < v23.x), 0u, u32(v23.z + x.z < v23.z)); + d = (d ^ v23).yxwz; + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + b = ((b ^ v89) >> ROTATE_24) | ((b ^ v89).yxwz << ROTATE_8); + v23 += b + vec4(0u, u32(v23.x + b.x < v23.x), 0u, u32(v23.z + b.z < v23.z)); + v23 += y + vec4(0u, u32(v23.x + y.x < v23.x), 0u, u32(v23.z + y.z < v23.z)); + d = ((d ^ v23) >> ROTATE_16) | ((d ^ v23).yxwz << ROTATE_16); + v89 += d + vec4(0u, u32(v89.x + d.x < v89.x), 0u, u32(v89.z + d.z < v89.z)); + // b = ((b ^ v89) << ROTATE_1) | ((b ^ v89).yxwz >> ROTATE_31); + + 
// v67 = vec4(v67.xy, b.xy); + // v45 = vec4(b.zw, v45.zw); + // v1213 = vec4(v1213.xy, d.xy); + // v1415 = vec4(d.zw, v1415.zw); @@ -1439,7 +1285,7 @@ fn main(id: vec3) { /** * Set nonce if it passes the threshold and no other thread has set it */ - if ((BLAKE2B_IV32_0.y ^ v0.y ^ v8.y) > threshold) { + if ((BLAKE2B_IV32_0.y ^ v01.y ^ v89.y) > ubo.threshold) { let wasFound: u32 = atomicExchange(&work.found, 1u); if (wasFound == 0u) { work.nonce = m0; diff --git a/test.html b/test.html index 35ac4eb..b09db45 100644 --- a/test.html +++ b/test.html @@ -117,21 +117,22 @@ SPDX-License-Identifier: GPL-3.0-or-later } function startValidation (event) { - console.log('validating') const threshold = document.getElementById('threshold') const work = document.getElementById('work') const hash = document.getElementById('hash') const validation = document.getElementById('validation') - validation.innerText = '' - NanoPow.validate(work.value, hash.value, { threshold: `0x${+threshold.value}` }) - .then(result => { - validation.innerText = result - ? '✔️' - : '❌' - }) - .catch(err => { - validation.innerText = '⏳' - }) + validation.innerText = '⏳' + if (work.value.length === 16 && hash.value.length === 64) { + NanoPow.validate(work.value, hash.value, { threshold: `0x${+threshold.value}` }) + .then(result => { + validation.innerText = result + ? '✔️' + : '❌' + }) + .catch(err => { + validation.innerText = '⏳' + }) + } } document.getElementById('threshold').addEventListener('input', startValidation) document.getElementById('work').addEventListener('input', startValidation) -- 2.47.3