-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathffx_cas.h
1445 lines (1445 loc) · 59.4 KB
/
ffx_cas.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
// [CAS] FIDELITY FX - CONTRAST ADAPTIVE SHARPENING 1.20190610
//
//==============================================================================================================================
// LICENSE
// =======
// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved.
// -------
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// -------
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
// -------
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//------------------------------------------------------------------------------------------------------------------------------
// ABOUT
// =====
// CAS is a spatial only filter.
// CAS takes RGB color input.
// CAS enhances sharpness and local high-frequency contrast, with or without added upsampling.
// CAS outputs RGB color.
//------------------------------------------------------------------------------------------------------------------------------
// SUGGESTIONS FOR INTEGRATION
// ===========================
// For best performance, run CAS in sharpen-only mode and choose a video mode so that scan-out or the display does the scaling.
// - Sharpen-only mode is faster, and provides a better quality sharpening.
// The scaling support in CAS was designed for when the application wants to do Dynamic Resolution Scaling (DRS).
// - With DRS, the render resolution can change per frame.
// - Use CAS to sharpen and upsample to the fixed output resolution, then composite the full resolution UI over CAS output.
// - This can all happen in one compute dispatch.
// It is likely better to reduce the amount of film grain which happens before CAS (as CAS will amplify grain).
// - An alternative would be to add grain after CAS.
// It is best to run CAS after tonemapping.
// - CAS needs to have input value 1.0 at the peak of the display output.
// It is ok to run CAS after compositing UI (it won't harm the UI).
//------------------------------------------------------------------------------------------------------------------------------
// EXECUTION
// =========
// CAS runs as a compute shader.
// CAS is designed to be run either in a 32-bit, CasFilter(), or packed 16-bit, CasFilterH(), form.
// The 32-bit form works on 8x8 tiles via one {64,1,1} workgroup.
// The 16-bit form works on a pair of 8x8 tiles in a 16x8 configuration via one {64,1,1} workgroup.
// CAS is designed to work best in semi-persistent form when not running async with graphics.
// For 32-bit this means looping across a collection of 4 8x8 tiles in a 2x2 tile foot-print.
// For 16-bit this means looping 2 times, once for the top 16x8 region and once for the bottom 16x8 region.
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR CPU
// ===========================
// // Make sure <stdint.h> has already been included.
// // Setup pre-portability-header defines.
// #define A_CPU 1
// // Include the portability header (requires version 1.20190530 or later which is backwards compatible).
// #include "ffx_a.h"
// // Include the CAS header.
// #include "ffx_cas.h"
// ...
// // Call the setup function to build out the constants for the shader, pass these to the shader.
// // The 'varAU4(const0);' expands into 'uint32_t const0[4];' on the CPU.
// varAU4(const0);
// varAU4(const1);
// CasSetup(const0,const1,
// 0.0f, // Sharpness tuning knob (0.0 to 1.0).
// 1920.0f,1080.0f, // Example input size.
// 2560.0f,1440.0f); // Example output size.
// ...
// // Later dispatch the shader based on the amount of semi-persistent loop unrolling.
// // Here is an example for running with the 16x16 (4-way unroll for 32-bit or 2-way unroll for 16-bit)
// vkCmdDispatch(cmdBuf,(widthInPixels+15)>>4,(heightInPixels+15)>>4,1);
//------------------------------------------------------------------------------------------------------------------------------
// INTEGRATION SUMMARY FOR GPU
// ===========================
// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT.
// layout(set=0,binding=0,rgba16f)uniform image2D imgSrc;
// layout(set=0,binding=1,rgba16f)uniform image2D imgDst;
// ...
// // Setup pre-portability-header defines (sets up GLSL/HLSL path, packed math support, etc)
// #define A_GPU 1
// #define A_GLSL 1
// #define A_HALF 1
// ...
// // Include the portability header (or copy it in without an include).
// #include "ffx_a.h"
// ...
// // Define the fetch function(s).
// // CasLoad() takes a 32-bit unsigned integer 2D coordinate and loads color.
// AF3 CasLoad(ASU2 p){return imageLoad(imgSrc,p).rgb;}
// // CasLoadH() is the 16-bit version taking 16-bit unsigned integer 2D coordinate and loading 16-bit float color.
// // The ASU2() typecast back to 32-bit is a NO-OP, the compiler pattern matches and uses A16 opcode support instead.
// // The AH3() typecast to 16-bit float is a NO-OP, the compiler pattern matches and uses D16 opcode support instead.
// AH3 CasLoadH(ASW2 p){return AH3(imageLoad(imgSrc,ASU2(p)).rgb);}
// ...
// // Define the input modifiers as nop's initially.
// // See "INPUT FORMAT SPECIFIC CASES" below for specifics on what to place in these functions.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){}
// ...
// // Include this CAS header file (or copy it in without an include).
// #include "ffx_cas.h"
// ...
// // Example in shader integration for loop-unrolled 16x16 case for 32-bit.
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AF4 c;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.y+=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);
// gxy.x-=8u;
// CasFilter(c.r,c.g,c.b,gxy,const0,const1,false);imageStore(imgDst,ASU2(gxy),c);}
// ...
// // Example for semi-persistent 16x16 but this time for packed math.
// // Use this before including 'ffx_cas.h' if not using the non-packed filter function.
// #define CAS_PACKED_ONLY 1
// ...
// layout(local_size_x=64)in;
// void main(){
// // Fetch constants from CasSetup().
// AU4 const0=cb.const0;
// AU4 const1=cb.const1;
// // Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
// AU2 gxy=ARmp8x8(gl_LocalInvocationID.x)+AU2(gl_WorkGroupID.x<<4u,gl_WorkGroupID.y<<4u);
// // Filter.
// AH4 c0,c1;AH2 cR,cG,cB;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// // Extra work integrated after CAS would go here.
// ...
// // Suggest only running CasDepack() right before stores, to maintain packed math for any work after CasFilterH().
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));
// gxy.y+=8u;
// CasFilterH(cR,cG,cB,gxy,const0,const1,false);
// ...
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(imgDst,ASU2(gxy),AF4(c0));
// imageStore(imgDst,ASU2(gxy)+ASU2(8,0),AF4(c1));}
//------------------------------------------------------------------------------------------------------------------------------
// CAS FILTERING LOGIC
// ===================
// CAS uses the minimal nearest 3x3 source texel window for filtering.
// The filter coefficients are radially symmetric (phase adaptive, computed per pixel based on output pixel center).
// The filter kernel adapts to local contrast (adjusting the negative lobe strength of the filter kernel).
//------------------------------------------------------------------------------------------------------------------------------
// CAS INPUT REQUIREMENTS
// ======================
// This is designed to be a linear filter.
// Running CAS on perceptual inputs will yield over-sharpening.
// Input must range between {0 to 1} for each color channel.
// CAS output will be {0 to 1} ranged as well.
// CAS does 5 loads, so any conversion applied during CasLoad() or CasInput() has a 5 load * 3 channel = 15x cost amplifier.
// - So input conversions need to be factored into the prior pass's output.
// - But if necessary use CasInput() instead of CasLoad(), as CasInput() works with packed color.
// - For CAS with scaling the amplifier is 12 load * 3 channel = 36x cost amplifier.
// Any conversion applied to output has a 3x cost amplifier (3 color channels).
// - Output conversions are substantially less expensive.
// Added VALU ops due to conversions will have visible cost as this shader is already quite VALU heavy.
// This filter does not function well on sRGB or gamma 2.2 non-linear data.
// This filter does not function on PQ non-linear data.
// - Due to the shape of PQ, the positive side of the ring created by the negative lobe tends to become over-bright.
//------------------------------------------------------------------------------------------------------------------------------
// INPUT FORMAT SPECIFIC CASES
// ===========================
// - FP16 with all non-negative values ranging {0 to 1}.
// - Use as is, filter is designed for linear input and output ranging {0 to 1}.
// ---------------------------
// - UNORM with linear conversion approximation.
// - This could be used for both sRGB or FreeSync2 native (gamma 2.2) cases.
// - Load/store with either 10:10:10:2 UNORM or 8:8:8:8 UNORM (aka VK_FORMAT_R8G8B8A8_UNORM).
// - Use gamma 2.0 conversion in CasInput(), as an approximation.
// - Modifications:
// // Change the CasInput*() function to square the inputs.
// void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){r*=r;g*=g;b*=b;}
// void CasInputH(inout AH2 r,inout AH2 g,inout AH2 b){r*=r;g*=g;b*=b;}
// ...
// // Do linear to gamma 2.0 before store.
// // Since it will be common to do processing after CAS, the filter function returns linear.
// c.r=sqrt(c.r);c.g=sqrt(c.g);c.b=sqrt(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=sqrt(cR);cG=sqrt(cG);cB=sqrt(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - sRGB with slightly better quality and higher cost.
// - Use texelFetch() with sRGB format (VK_FORMAT_R8G8B8A8_SRGB) for loads (gets linear into shader).
// - Store to destination using UNORM (not sRGB) stores and do the linear to sRGB conversion in the shader.
// - Modifications:
// // Use texel fetch instead of image load (on GCN this will translate into an image load in the driver).
// // Hardware has sRGB to linear on loads (but in API only for read-only, aka texture instead of UAV/image).
// AF3 CasLoad(ASU2 p){return texelFetch(texSrc,p,0).rgb;}
// ...
// // Do linear to sRGB before store (GPU lacking hardware conversion support for linear to sRGB on store).
// c.r=AToSrgbF1(c.r);c.g=AToSrgbF1(c.g);c.b=AToSrgbF1(c.b);
// imageStore(imgDst,ASU2(gxy),c);
// ...
// // And for packed.
// CasFilterH(cR,cG,cB,gxy,const0,const1,true);
// cR=AToSrgbH2(cR);cG=AToSrgbH2(cG);cB=AToSrgbH2(cB);
// CasDepack(c0,c1,cR,cG,cB);
// imageStore(img[0],ASU2(gxy),AF4(c0));
// imageStore(img[0],ASU2(gxy+AU2(8,0)),AF4(c1));
// ---------------------------
// - HDR10 output via scRGB.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - After CAS do matrix multiply to take Rec.2020 back to sRGB and multiply by 'maxNits/80.0'.
// - Showing GPU code below to generate constants, likely most need to use CPU code instead.
// - Keeping the GPU code here because it is easier to read in these docs.
// - Can use 'lpm.h' source to generate the conversion matrix for Rec.2020 to sRGB:
// // Output conversion matrix from Rec.2020 to sRGB.
// AF3 conR,conG,conB;
// // Working space temporaries (Rec.2020).
// AF3 rgbToXyzXW;AF3 rgbToXyzYW;AF3 rgbToXyzZW;
// LpmColRgbToXyz(rgbToXyzXW,rgbToXyzYW,rgbToXyzZW,lpmCol2020R,lpmCol2020G,lpmCol2020B,lpmColD65);
// // Output space temporaries (Rec.709, same as sRGB primaries).
// AF3 rgbToXyzXO;AF3 rgbToXyzYO;AF3 rgbToXyzZO;
// LpmColRgbToXyz(rgbToXyzXO,rgbToXyzYO,rgbToXyzZO,lpmCol709R,lpmCol709G,lpmCol709B,lpmColD65);
// AF3 xyzToRgbRO;AF3 xyzToRgbGO;AF3 xyzToRgbBO;
// LpmMatInv3x3(xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXO,rgbToXyzYO,rgbToXyzZO);
// // Generate the matrix.
// LpmMatMul3x3(conR,conG,conB,xyzToRgbRO,xyzToRgbGO,xyzToRgbBO,rgbToXyzXW,rgbToXyzYW,rgbToXyzZW);
// - Adjust the conversion matrix for the multiply by 'maxNits/80.0'.
// // After this the constants can be stored into a constant buffer.
// AF1 conScale=maxNits*ARcpF1(80.0);
// conR*=conScale;conG*=conScale;conB*=conScale;
// - After CAS do the matrix multiply (passing the fetched constants into the shader).
// outputR=dot(AF3(colorR,colorG,colorB),conR);
// outputG=dot(AF3(colorR,colorG,colorB),conG);
// outputB=dot(AF3(colorR,colorG,colorB),conB);
// - Hopefully no developer is taking scRGB as input to CAS.
// - If that was the case, the conversion matrix from sRGB to Rec.2020 can be built changing the above code.
// - Swap the 'lpmCol709*' and 'lpmCol2020*' inputs to LpmColRgbToXyz().
// - Then scale by '80.0/maxNits' instead of 'maxNits/80.0'.
// ---------------------------
// - HDR10 output via native 10:10:10:2.
// - Pass before CAS needs to write out linear Rec.2020 colorspace output (all positive values).
// - Write to FP16 with {0 to 1} mapped to {0 to maxNits} nits.
// - Where 'maxNits' is typically not 10000.
// - Instead set 'maxNits' to the nits level that the HDR TV starts to clip white.
// - This can be even as low as 1000 nits on some HDR TVs.
// - Hopefully no developer needs to take PQ as input here, but if so can use A to convert PQ to linear:
// // Where 'k0' is a constant of 'maxNits/10000.0'.
// colorR=AFromPqF1(colorR*k0);
// colorG=AFromPqF1(colorG*k0);
// colorB=AFromPqF1(colorB*k0);
// - After CAS convert from linear to PQ.
// // Where 'k1' is a constant of '10000.0/maxNits'.
// colorR=AToPqF1(colorR*k1);
// colorG=AToPqF1(colorG*k1);
// colorB=AToPqF1(colorB*k1);
// ---------------------------
// - Example of a bad idea for CAS input design.
// - Have the pass before CAS store out in 10:10:10:2 UNORM with gamma 2.0.
// - Store the output of CAS with sRGB to linear conversion, or with a gamma 2.2 conversion for FreeSync2 native.
// - This will drop precision because the inputs had been quantized to 10-bit,
// and the output is using a different tonal transform,
// so inputs and outputs won't align for similar values.
// - It might be "ok" for 8-bit/channel CAS output, but definitely not a good idea for 10-bit/channel output.
//------------------------------------------------------------------------------------------------------------------------------
// ALGORITHM DESCRIPTION
// =====================
// This describes the algorithm with CAS_BETTER_DIAGONALS defined.
// The default is with CAS_BETTER_DIAGONALS not defined (which is faster).
// Starting with no scaling.
// CAS fetches a 3x3 neighborhood around the pixel 'e',
// a b c
// d(e)f
// g h i
// It then computes a 'soft' minimum and maximum,
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// The minimum and maximums give an idea of local contrast.
// --- 1.0 ^
// | | <-- This minimum distance to the signal limit is divided by MAX to get a base sharpening amount 'A'.
// --- MAX v
// |
// |
// --- MIN ^
// | | <-- The MIN side is more distant in this example so it is not used, but for dark colors it would be used.
// | |
// --- 0.0 v
// The base sharpening amount 'A' from above is shaped with a sqrt().
// This 'A' ranges from 0 := no sharpening, to 1 := full sharpening.
// Then 'A' is scaled by the sharpness knob while being transformed to a negative lobe (values from -1/5 to -1/8 for A=1).
// The final filter kernel looks like this,
// 0 A 0
// A 1 A <-- Center is always 1.0, followed by the negative lobe 'A' in a ring, and windowed into a circle with the 0.0s.
// 0 A 0
// The local neighborhood is then multiplied by the kernel weights, summed and divided by the sum of the kernel weights.
// The high quality path computes filter weights per channel.
// The low quality path uses the green channel's filter weights to compute the 'A' factor for all channels.
// ---------------------
// The scaling path is a little more complex.
// It starts by fetching the 4x4 neighborhood around the pixel centered between centers of pixels {f,g,j,k},
// a b c d
// e(f g)h
// i(j k)l
// m n o p
// The algorithm then computes the no-scaling result for {f,g,j,k}.
// It then interpolates between those no-scaling results.
// The interpolation is adaptive.
// To hide bilinear interpolation and restore diagonals, it weights bilinear weights by 1/(const+contrast).
// Where 'contrast' is the soft 'max-min'.
// This makes edges thin out a little.
// ---------------------
// Without CAS_BETTER_DIAGONALS defined, the algorithm is a little faster.
// Instead of using the 3x3 "box" with the 5-tap "circle" this uses just the "circle".
// Drops to 5 texture fetches for no-scaling.
// Drops to 12 texture fetches for scaling.
// Drops a bunch of math.
//------------------------------------------------------------------------------------------------------------------------------
// IDEAS FOR FUTURE
// ================
// - Avoid V_CVT's by using denormals.
// - Manually pack FP16 literals.
//------------------------------------------------------------------------------------------------------------------------------
// CHANGE LOG
// ==========
// 20190610 - Misc documentation cleanup.
// 20190609 - Removed lowQuality bool, improved scaling logic.
// 20190530 - Unified CPU/GPU setup code, using new ffx_a.h, faster, define CAS_BETTER_DIAGONALS to get older slower one.
// 20190529 - Missing a good way to re-interpret packed in HLSL, so disabling approximation optimizations for now.
// 20190528 - Fixed so GPU CasSetup() generates half data all the time.
// 20190527 - Implemented approximations for rcp() and sqrt().
// 20190524 - New algorithm, adjustable sharpness, scaling to 4x area. Fixed checker debug for no-scaling only.
// 20190521 - Updated file naming.
// 20190516 - Updated docs, fixed workaround, fixed no-scaling quality issue, removed gamma2 and generalized as CasInput*().
// 20190510 - Made the dispatch example safely round up for images that are not a multiple of 16x16.
// 20190507 - Fixed typo bug in CAS_DEBUG_CHECKER, fixed sign typo in the docs.
// 20190503 - Setup temporary workaround for compiler bug.
// 20190502 - Added argument for 'gamma2' path so input transform in that case runs packed.
// 20190426 - Improved documentation on format specific cases, etc.
// 20190425 - Updated/corrected documentation.
// 20190405 - Added CAS_PACKED_ONLY, misc bug fixes.
// 20190404 - Updated for the new a.h header.
//==============================================================================================================================
// CAS_AREA_LIMIT is the practical limit for the algorithm's scaling ability (quality is limited by 3x3 taps).
// It is expressed as the ratio of output area to input area, and is the bound CasSupportScaling() tests against.
// Example resolutions,
// 1280x720 -> 1080p = 2.25x area
// 1536x864 -> 1080p = 1.56x area
// 1792x1008 -> 1440p = 2.04x area
// 1920x1080 -> 1440p = 1.78x area
// 1920x1080 -> 4K = 4.0x area
// 2048x1152 -> 1440p = 1.56x area
// 2560x1440 -> 4K = 2.25x area
// 3072x1728 -> 4K = 1.56x area
#define CAS_AREA_LIMIT 4.0
//------------------------------------------------------------------------------------------------------------------------------
// Tests whether a given input -> output resize stays within CAS_AREA_LIMIT.
//  outX,outY = output resolution in pixels.
//  inX,inY   = input resolution in pixels.
// Returns true when (output area / input area) is at most CAS_AREA_LIMIT.
AP1 CasSupportScaling(AF1 outX,AF1 outY,AF1 inX,AF1 inY){
 AF1 outArea=outX*outY;
 AF1 inArea=inX*inY;
 // The divide is done as a multiply by the reciprocal of the input area.
 AF1 areaRatio=outArea*ARcpF1(inArea);
 return areaRatio<=CAS_AREA_LIMIT;}
//==============================================================================================================================
// Builds the two constant vectors consumed by the CAS filter functions (usable from CPU or GPU code).
// Every float term is written through AU1_AF1() and read back in the shader with AF1_AU1()/AF2_AU2().
A_STATIC void CasSetup(
outAU4 const0, // Out: scaling terms, xy = per-axis scale, zw = per-axis offset.
outAU4 const1, // Out: sharpness terms.
AF1 sharpness, // 0 := default (lower ringing), 1 := maximum (highest ringing)
AF1 inputSizeInPixelsX,
AF1 inputSizeInPixelsY,
AF1 outputSizeInPixelsX,
AF1 outputSizeInPixelsY){
 // Reciprocal of each output dimension, evaluated once and reused by the terms below.
 AF1 rcpOutX=ARcpF1(outputSizeInPixelsX);
 AF1 rcpOutY=ARcpF1(outputSizeInPixelsY);
 // Negative lobe strength derived from the sharpness knob (documented range is -1/8 to -1/5).
 AF1 lobe=-ARcpF1(ALerpF1(8.0,5.0,ASatF1(sharpness)));
 // Two-component copy {lobe,0} used as the source for the packed constant in const1[1].
 varAF2(lobePair)=initAF2(lobe,0.0);
 // Scale: input size over output size, per axis.
 const0[0]=AU1_AF1(inputSizeInPixelsX*rcpOutX);
 const0[1]=AU1_AF1(inputSizeInPixelsY*rcpOutY);
 // Offset: half the scale minus one half, per axis.
 // CasFilter() forms the source position as outputPixel*scale+offset.
 const0[2]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsX*rcpOutX-AF1_(0.5));
 const0[3]=AU1_AF1(AF1_(0.5)*inputSizeInPixelsY*rcpOutY-AF1_(0.5));
 // Lobe strength as a 32-bit float (CasFilter() reads this as 'peak').
 const1[0]=AU1_AF1(lobe);
 // Lobe strength converted by AU1_AH2_AF2(); presumably for the packed 16-bit filter - TODO confirm against CasFilterH().
 const1[1]=AU1_AH2_AF2(lobePair);
 // Eight times the horizontal scale; the consumer is outside this excerpt - TODO confirm its use.
 const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*rcpOutX);
 // Last slot is written as zero.
 const1[3]=0;}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef A_GPU
#ifdef CAS_PACKED_ONLY
 // With CAS_PACKED_ONLY defined, these placeholder callbacks are provided here.
 // CasFilter() below still references CasLoad() and CasInput(), so they keep it compiling.
 // Placeholder load: ignores the coordinate and returns black.
 AF3 CasLoad(ASU2 p){
  AF3 black=AF3(0.0,0.0,0.0);
  return black;}
 // Placeholder input transform: leaves the color unchanged.
 void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){
 }
#endif
//------------------------------------------------------------------------------------------------------------------------------
void CasFilter(
out AF1 pixR, // Output values, non-vector so port between CasFilter() and CasFilterH() is easy.
out AF1 pixG,
out AF1 pixB,
AU2 ip, // Integer pixel position in output.
AU4 const0, // Constants generated by CasSetup().
AU4 const1,
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
#ifdef CAS_DEBUG_CHECKER
if((((ip.x^ip.y)>>8u)&1u)==0u){AF3 pix0=CasLoad(ASU2(ip));
pixR=pix0.r;pixG=pix0.g;pixB=pix0.b;CasInput(pixR,pixG,pixB);return;}
#endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if(noScaling){
// a b c
// d e f
// g h i
ASU2 sp=ASU2(ip);
AF3 a=CasLoad(sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1));
AF3 c=CasLoad(sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2(-1, 0));
AF3 e=CasLoad(sp);
AF3 f=CasLoad(sp+ASU2( 1, 0));
AF3 g=CasLoad(sp+ASU2(-1, 1));
AF3 h=CasLoad(sp+ASU2( 0, 1));
AF3 i=CasLoad(sp+ASU2( 1, 1));
// Run optional input transform.
CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b);
CasInput(c.r,c.g,c.b);
CasInput(d.r,d.g,d.b);
CasInput(e.r,e.g,e.b);
CasInput(f.r,f.g,f.b);
CasInput(g.r,g.g,g.b);
CasInput(h.r,h.g,h.b);
CasInput(i.r,i.g,i.b);
// Soft min and max.
// a b c b
// d e f * 0.5 + d e f * 0.5
// g h i h
// These are 2.0x bigger (factored out the extra multiply).
AF1 mnR=AMin3F1(AMin3F1(d.r,e.r,f.r),b.r,h.r);
AF1 mnG=AMin3F1(AMin3F1(d.g,e.g,f.g),b.g,h.g);
AF1 mnB=AMin3F1(AMin3F1(d.b,e.b,f.b),b.b,h.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnR2=AMin3F1(AMin3F1(mnR,a.r,c.r),g.r,i.r);
AF1 mnG2=AMin3F1(AMin3F1(mnG,a.g,c.g),g.g,i.g);
AF1 mnB2=AMin3F1(AMin3F1(mnB,a.b,c.b),g.b,i.b);
mnR=mnR+mnR2;
mnG=mnG+mnG2;
mnB=mnB+mnB2;
#endif
AF1 mxR=AMax3F1(AMax3F1(d.r,e.r,f.r),b.r,h.r);
AF1 mxG=AMax3F1(AMax3F1(d.g,e.g,f.g),b.g,h.g);
AF1 mxB=AMax3F1(AMax3F1(d.b,e.b,f.b),b.b,h.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxR2=AMax3F1(AMax3F1(mxR,a.r,c.r),g.r,i.r);
AF1 mxG2=AMax3F1(AMax3F1(mxG,a.g,c.g),g.g,i.g);
AF1 mxB2=AMax3F1(AMax3F1(mxB,a.b,c.b),g.b,i.b);
mxR=mxR+mxR2;
mxG=mxG+mxG2;
mxB=mxB+mxB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
#ifdef CAS_GO_SLOWER
AF1 rcpMR=ARcpF1(mxR);
AF1 rcpMG=ARcpF1(mxG);
AF1 rcpMB=ARcpF1(mxB);
#else
AF1 rcpMR=APrxLoRcpF1(mxR);
AF1 rcpMG=APrxLoRcpF1(mxG);
AF1 rcpMB=APrxLoRcpF1(mxB);
#endif
#ifdef CAS_BETTER_DIAGONALS
AF1 ampR=ASatF1(min(mnR,AF1_(2.0)-mxR)*rcpMR);
AF1 ampG=ASatF1(min(mnG,AF1_(2.0)-mxG)*rcpMG);
AF1 ampB=ASatF1(min(mnB,AF1_(2.0)-mxB)*rcpMB);
#else
AF1 ampR=ASatF1(min(mnR,AF1_(1.0)-mxR)*rcpMR);
AF1 ampG=ASatF1(min(mnG,AF1_(1.0)-mxG)*rcpMG);
AF1 ampB=ASatF1(min(mnB,AF1_(1.0)-mxB)*rcpMB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampR=sqrt(ampR);
ampG=sqrt(ampG);
ampB=sqrt(ampB);
#else
ampR=APrxLoSqrtF1(ampR);
ampG=APrxLoSqrtF1(ampG);
ampB=APrxLoSqrtF1(ampB);
#endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
AF1 peak=AF1_AU1(const1.x);
AF1 wR=ampR*peak;
AF1 wG=ampG*peak;
AF1 wB=ampB*peak;
// Filter.
#ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
#ifdef CAS_GO_SLOWER
AF1 rcpWeight=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
#else
AF1 rcpWeight=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
#endif
pixR=ASatF1((b.r*wG+d.r*wG+f.r*wG+h.r*wG+e.r)*rcpWeight);
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeight);
pixB=ASatF1((b.b*wG+d.b*wG+f.b*wG+h.b*wG+e.b)*rcpWeight);
#else
#ifdef CAS_GO_SLOWER
AF1 rcpWeightR=ARcpF1(AF1_(1.0)+AF1_(4.0)*wR);
AF1 rcpWeightG=ARcpF1(AF1_(1.0)+AF1_(4.0)*wG);
AF1 rcpWeightB=ARcpF1(AF1_(1.0)+AF1_(4.0)*wB);
#else
AF1 rcpWeightR=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wR);
AF1 rcpWeightG=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wG);
AF1 rcpWeightB=APrxMedRcpF1(AF1_(1.0)+AF1_(4.0)*wB);
#endif
pixR=ASatF1((b.r*wR+d.r*wR+f.r*wR+h.r*wR+e.r)*rcpWeightR);
pixG=ASatF1((b.g*wG+d.g*wG+f.g*wG+h.g*wG+e.g)*rcpWeightG);
pixB=ASatF1((b.b*wB+d.b*wB+f.b*wB+h.b*wB+e.b)*rcpWeightB);
#endif
return;}
//------------------------------------------------------------------------------------------------------------------------------
// Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
// a b c d
// e f g h
// i j k l
// m n o p
// Working these 4 results.
// +-----+-----+
// | | |
// | f..|..g |
// | . | . |
// +-----+-----+
// | . | . |
// | j..|..k |
// | | |
// +-----+-----+
AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
AF2 fp=floor(pp);
pp-=fp;
ASU2 sp=ASU2(fp);
AF3 a=CasLoad(sp+ASU2(-1,-1));
AF3 b=CasLoad(sp+ASU2( 0,-1));
AF3 e=CasLoad(sp+ASU2(-1, 0));
AF3 f=CasLoad(sp);
AF3 c=CasLoad(sp+ASU2( 1,-1));
AF3 d=CasLoad(sp+ASU2( 2,-1));
AF3 g=CasLoad(sp+ASU2( 1, 0));
AF3 h=CasLoad(sp+ASU2( 2, 0));
AF3 i=CasLoad(sp+ASU2(-1, 1));
AF3 j=CasLoad(sp+ASU2( 0, 1));
AF3 m=CasLoad(sp+ASU2(-1, 2));
AF3 n=CasLoad(sp+ASU2( 0, 2));
AF3 k=CasLoad(sp+ASU2( 1, 1));
AF3 l=CasLoad(sp+ASU2( 2, 1));
AF3 o=CasLoad(sp+ASU2( 1, 2));
AF3 p=CasLoad(sp+ASU2( 2, 2));
// Run optional input transform.
CasInput(a.r,a.g,a.b);
CasInput(b.r,b.g,b.b);
CasInput(c.r,c.g,c.b);
CasInput(d.r,d.g,d.b);
CasInput(e.r,e.g,e.b);
CasInput(f.r,f.g,f.b);
CasInput(g.r,g.g,g.b);
CasInput(h.r,h.g,h.b);
CasInput(i.r,i.g,i.b);
CasInput(j.r,j.g,j.b);
CasInput(k.r,k.g,k.b);
CasInput(l.r,l.g,l.b);
CasInput(m.r,m.g,m.b);
CasInput(n.r,n.g,n.b);
CasInput(o.r,o.g,o.b);
CasInput(p.r,p.g,p.b);
// Soft min and max.
// These are 2.0x bigger (factored out the extra multiply).
// a b c b
// e f g * 0.5 + e f g * 0.5 [F]
// i j k j
AF1 mnfR=AMin3F1(AMin3F1(b.r,e.r,f.r),g.r,j.r);
AF1 mnfG=AMin3F1(AMin3F1(b.g,e.g,f.g),g.g,j.g);
AF1 mnfB=AMin3F1(AMin3F1(b.b,e.b,f.b),g.b,j.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnfR2=AMin3F1(AMin3F1(mnfR,a.r,c.r),i.r,k.r);
AF1 mnfG2=AMin3F1(AMin3F1(mnfG,a.g,c.g),i.g,k.g);
AF1 mnfB2=AMin3F1(AMin3F1(mnfB,a.b,c.b),i.b,k.b);
mnfR=mnfR+mnfR2;
mnfG=mnfG+mnfG2;
mnfB=mnfB+mnfB2;
#endif
AF1 mxfR=AMax3F1(AMax3F1(b.r,e.r,f.r),g.r,j.r);
AF1 mxfG=AMax3F1(AMax3F1(b.g,e.g,f.g),g.g,j.g);
AF1 mxfB=AMax3F1(AMax3F1(b.b,e.b,f.b),g.b,j.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxfR2=AMax3F1(AMax3F1(mxfR,a.r,c.r),i.r,k.r);
AF1 mxfG2=AMax3F1(AMax3F1(mxfG,a.g,c.g),i.g,k.g);
AF1 mxfB2=AMax3F1(AMax3F1(mxfB,a.b,c.b),i.b,k.b);
mxfR=mxfR+mxfR2;
mxfG=mxfG+mxfG2;
mxfB=mxfB+mxfB2;
#endif
// b c d c
// f g h * 0.5 + f g h * 0.5 [G]
// j k l k
AF1 mngR=AMin3F1(AMin3F1(c.r,f.r,g.r),h.r,k.r);
AF1 mngG=AMin3F1(AMin3F1(c.g,f.g,g.g),h.g,k.g);
AF1 mngB=AMin3F1(AMin3F1(c.b,f.b,g.b),h.b,k.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mngR2=AMin3F1(AMin3F1(mngR,b.r,d.r),j.r,l.r);
AF1 mngG2=AMin3F1(AMin3F1(mngG,b.g,d.g),j.g,l.g);
AF1 mngB2=AMin3F1(AMin3F1(mngB,b.b,d.b),j.b,l.b);
mngR=mngR+mngR2;
mngG=mngG+mngG2;
mngB=mngB+mngB2;
#endif
AF1 mxgR=AMax3F1(AMax3F1(c.r,f.r,g.r),h.r,k.r);
AF1 mxgG=AMax3F1(AMax3F1(c.g,f.g,g.g),h.g,k.g);
AF1 mxgB=AMax3F1(AMax3F1(c.b,f.b,g.b),h.b,k.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxgR2=AMax3F1(AMax3F1(mxgR,b.r,d.r),j.r,l.r);
AF1 mxgG2=AMax3F1(AMax3F1(mxgG,b.g,d.g),j.g,l.g);
AF1 mxgB2=AMax3F1(AMax3F1(mxgB,b.b,d.b),j.b,l.b);
mxgR=mxgR+mxgR2;
mxgG=mxgG+mxgG2;
mxgB=mxgB+mxgB2;
#endif
// e f g f
// i j k * 0.5 + i j k * 0.5 [J]
// m n o n
AF1 mnjR=AMin3F1(AMin3F1(f.r,i.r,j.r),k.r,n.r);
AF1 mnjG=AMin3F1(AMin3F1(f.g,i.g,j.g),k.g,n.g);
AF1 mnjB=AMin3F1(AMin3F1(f.b,i.b,j.b),k.b,n.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnjR2=AMin3F1(AMin3F1(mnjR,e.r,g.r),m.r,o.r);
AF1 mnjG2=AMin3F1(AMin3F1(mnjG,e.g,g.g),m.g,o.g);
AF1 mnjB2=AMin3F1(AMin3F1(mnjB,e.b,g.b),m.b,o.b);
mnjR=mnjR+mnjR2;
mnjG=mnjG+mnjG2;
mnjB=mnjB+mnjB2;
#endif
AF1 mxjR=AMax3F1(AMax3F1(f.r,i.r,j.r),k.r,n.r);
AF1 mxjG=AMax3F1(AMax3F1(f.g,i.g,j.g),k.g,n.g);
AF1 mxjB=AMax3F1(AMax3F1(f.b,i.b,j.b),k.b,n.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxjR2=AMax3F1(AMax3F1(mxjR,e.r,g.r),m.r,o.r);
AF1 mxjG2=AMax3F1(AMax3F1(mxjG,e.g,g.g),m.g,o.g);
AF1 mxjB2=AMax3F1(AMax3F1(mxjB,e.b,g.b),m.b,o.b);
mxjR=mxjR+mxjR2;
mxjG=mxjG+mxjG2;
mxjB=mxjB+mxjB2;
#endif
// f g h g
// j k l * 0.5 + j k l * 0.5 [K]
// n o p o
AF1 mnkR=AMin3F1(AMin3F1(g.r,j.r,k.r),l.r,o.r);
AF1 mnkG=AMin3F1(AMin3F1(g.g,j.g,k.g),l.g,o.g);
AF1 mnkB=AMin3F1(AMin3F1(g.b,j.b,k.b),l.b,o.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mnkR2=AMin3F1(AMin3F1(mnkR,f.r,h.r),n.r,p.r);
AF1 mnkG2=AMin3F1(AMin3F1(mnkG,f.g,h.g),n.g,p.g);
AF1 mnkB2=AMin3F1(AMin3F1(mnkB,f.b,h.b),n.b,p.b);
mnkR=mnkR+mnkR2;
mnkG=mnkG+mnkG2;
mnkB=mnkB+mnkB2;
#endif
AF1 mxkR=AMax3F1(AMax3F1(g.r,j.r,k.r),l.r,o.r);
AF1 mxkG=AMax3F1(AMax3F1(g.g,j.g,k.g),l.g,o.g);
AF1 mxkB=AMax3F1(AMax3F1(g.b,j.b,k.b),l.b,o.b);
#ifdef CAS_BETTER_DIAGONALS
AF1 mxkR2=AMax3F1(AMax3F1(mxkR,f.r,h.r),n.r,p.r);
AF1 mxkG2=AMax3F1(AMax3F1(mxkG,f.g,h.g),n.g,p.g);
AF1 mxkB2=AMax3F1(AMax3F1(mxkB,f.b,h.b),n.b,p.b);
mxkR=mxkR+mxkR2;
mxkG=mxkG+mxkG2;
mxkB=mxkB+mxkB2;
#endif
// Smooth minimum distance to signal limit divided by smooth max.
#ifdef CAS_GO_SLOWER
AF1 rcpMfR=ARcpF1(mxfR);
AF1 rcpMfG=ARcpF1(mxfG);
AF1 rcpMfB=ARcpF1(mxfB);
AF1 rcpMgR=ARcpF1(mxgR);
AF1 rcpMgG=ARcpF1(mxgG);
AF1 rcpMgB=ARcpF1(mxgB);
AF1 rcpMjR=ARcpF1(mxjR);
AF1 rcpMjG=ARcpF1(mxjG);
AF1 rcpMjB=ARcpF1(mxjB);
AF1 rcpMkR=ARcpF1(mxkR);
AF1 rcpMkG=ARcpF1(mxkG);
AF1 rcpMkB=ARcpF1(mxkB);
#else
AF1 rcpMfR=APrxLoRcpF1(mxfR);
AF1 rcpMfG=APrxLoRcpF1(mxfG);
AF1 rcpMfB=APrxLoRcpF1(mxfB);
AF1 rcpMgR=APrxLoRcpF1(mxgR);
AF1 rcpMgG=APrxLoRcpF1(mxgG);
AF1 rcpMgB=APrxLoRcpF1(mxgB);
AF1 rcpMjR=APrxLoRcpF1(mxjR);
AF1 rcpMjG=APrxLoRcpF1(mxjG);
AF1 rcpMjB=APrxLoRcpF1(mxjB);
AF1 rcpMkR=APrxLoRcpF1(mxkR);
AF1 rcpMkG=APrxLoRcpF1(mxkG);
AF1 rcpMkB=APrxLoRcpF1(mxkB);
#endif
#ifdef CAS_BETTER_DIAGONALS
AF1 ampfR=ASatF1(min(mnfR,AF1_(2.0)-mxfR)*rcpMfR);
AF1 ampfG=ASatF1(min(mnfG,AF1_(2.0)-mxfG)*rcpMfG);
AF1 ampfB=ASatF1(min(mnfB,AF1_(2.0)-mxfB)*rcpMfB);
AF1 ampgR=ASatF1(min(mngR,AF1_(2.0)-mxgR)*rcpMgR);
AF1 ampgG=ASatF1(min(mngG,AF1_(2.0)-mxgG)*rcpMgG);
AF1 ampgB=ASatF1(min(mngB,AF1_(2.0)-mxgB)*rcpMgB);
AF1 ampjR=ASatF1(min(mnjR,AF1_(2.0)-mxjR)*rcpMjR);
AF1 ampjG=ASatF1(min(mnjG,AF1_(2.0)-mxjG)*rcpMjG);
AF1 ampjB=ASatF1(min(mnjB,AF1_(2.0)-mxjB)*rcpMjB);
AF1 ampkR=ASatF1(min(mnkR,AF1_(2.0)-mxkR)*rcpMkR);
AF1 ampkG=ASatF1(min(mnkG,AF1_(2.0)-mxkG)*rcpMkG);
AF1 ampkB=ASatF1(min(mnkB,AF1_(2.0)-mxkB)*rcpMkB);
#else
AF1 ampfR=ASatF1(min(mnfR,AF1_(1.0)-mxfR)*rcpMfR);
AF1 ampfG=ASatF1(min(mnfG,AF1_(1.0)-mxfG)*rcpMfG);
AF1 ampfB=ASatF1(min(mnfB,AF1_(1.0)-mxfB)*rcpMfB);
AF1 ampgR=ASatF1(min(mngR,AF1_(1.0)-mxgR)*rcpMgR);
AF1 ampgG=ASatF1(min(mngG,AF1_(1.0)-mxgG)*rcpMgG);
AF1 ampgB=ASatF1(min(mngB,AF1_(1.0)-mxgB)*rcpMgB);
AF1 ampjR=ASatF1(min(mnjR,AF1_(1.0)-mxjR)*rcpMjR);
AF1 ampjG=ASatF1(min(mnjG,AF1_(1.0)-mxjG)*rcpMjG);
AF1 ampjB=ASatF1(min(mnjB,AF1_(1.0)-mxjB)*rcpMjB);
AF1 ampkR=ASatF1(min(mnkR,AF1_(1.0)-mxkR)*rcpMkR);
AF1 ampkG=ASatF1(min(mnkG,AF1_(1.0)-mxkG)*rcpMkG);
AF1 ampkB=ASatF1(min(mnkB,AF1_(1.0)-mxkB)*rcpMkB);
#endif
// Shaping amount of sharpening.
#ifdef CAS_GO_SLOWER
ampfR=sqrt(ampfR);
ampfG=sqrt(ampfG);
ampfB=sqrt(ampfB);
ampgR=sqrt(ampgR);
ampgG=sqrt(ampgG);
ampgB=sqrt(ampgB);
ampjR=sqrt(ampjR);
ampjG=sqrt(ampjG);
ampjB=sqrt(ampjB);
ampkR=sqrt(ampkR);
ampkG=sqrt(ampkG);
ampkB=sqrt(ampkB);
#else
ampfR=APrxLoSqrtF1(ampfR);
ampfG=APrxLoSqrtF1(ampfG);
ampfB=APrxLoSqrtF1(ampfB);
ampgR=APrxLoSqrtF1(ampgR);
ampgG=APrxLoSqrtF1(ampgG);
ampgB=APrxLoSqrtF1(ampgB);
ampjR=APrxLoSqrtF1(ampjR);
ampjG=APrxLoSqrtF1(ampjG);
ampjB=APrxLoSqrtF1(ampjB);
ampkR=APrxLoSqrtF1(ampkR);
ampkG=APrxLoSqrtF1(ampkG);
ampkB=APrxLoSqrtF1(ampkB);
#endif
// Filter shape.
// 0 w 0
// w 1 w
// 0 w 0
AF1 peak=AF1_AU1(const1.x);
AF1 wfR=ampfR*peak;
AF1 wfG=ampfG*peak;
AF1 wfB=ampfB*peak;
AF1 wgR=ampgR*peak;
AF1 wgG=ampgG*peak;
AF1 wgB=ampgB*peak;
AF1 wjR=ampjR*peak;
AF1 wjG=ampjG*peak;
AF1 wjB=ampjB*peak;
AF1 wkR=ampkR*peak;
AF1 wkG=ampkG*peak;
AF1 wkB=ampkB*peak;
// Blend between 4 results.
// s t
// u v
AF1 s=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
AF1 t= pp.x *(AF1_(1.0)-pp.y);
AF1 u=(AF1_(1.0)-pp.x)* pp.y ;
AF1 v= pp.x * pp.y ;
// Thin edges to hide bilinear interpolation (helps diagonals).
AF1 thinB=1.0/32.0;
#ifdef CAS_GO_SLOWER
s*=ARcpF1(thinB+(mxfG-mnfG));
t*=ARcpF1(thinB+(mxgG-mngG));
u*=ARcpF1(thinB+(mxjG-mnjG));
v*=ARcpF1(thinB+(mxkG-mnkG));
#else
s*=APrxLoRcpF1(thinB+(mxfG-mnfG));
t*=APrxLoRcpF1(thinB+(mxgG-mngG));
u*=APrxLoRcpF1(thinB+(mxjG-mnjG));
v*=APrxLoRcpF1(thinB+(mxkG-mnkG));
#endif
// Final weighting.
// b c
// e f g h
// i j k l
// n o
// _____ _____ _____ _____
// fs gt
//
// _____ _____ _____ _____
// fs s gt fs t gt
// ju kv
// _____ _____ _____ _____
// fs gt
// ju u kv ju v kv
// _____ _____ _____ _____
//
// ju kv
AF1 qbeR=wfR*s;
AF1 qbeG=wfG*s;
AF1 qbeB=wfB*s;
AF1 qchR=wgR*t;
AF1 qchG=wgG*t;
AF1 qchB=wgB*t;
AF1 qfR=wgR*t+wjR*u+s;
AF1 qfG=wgG*t+wjG*u+s;
AF1 qfB=wgB*t+wjB*u+s;
AF1 qgR=wfR*s+wkR*v+t;
AF1 qgG=wfG*s+wkG*v+t;
AF1 qgB=wfB*s+wkB*v+t;
AF1 qjR=wfR*s+wkR*v+u;
AF1 qjG=wfG*s+wkG*v+u;
AF1 qjB=wfB*s+wkB*v+u;
AF1 qkR=wgR*t+wjR*u+v;
AF1 qkG=wgG*t+wjG*u+v;
AF1 qkB=wgB*t+wjB*u+v;
AF1 qinR=wjR*u;
AF1 qinG=wjG*u;
AF1 qinB=wjB*u;
AF1 qloR=wkR*v;
AF1 qloG=wkG*v;
AF1 qloB=wkB*v;
// Filter.
#ifndef CAS_SLOW
// Using green coef only, depending on dead code removal to strip out the extra overhead.
#ifdef CAS_GO_SLOWER
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
#else
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
#endif
pixR=ASatF1((b.r*qbeG+e.r*qbeG+c.r*qchG+h.r*qchG+i.r*qinG+n.r*qinG+l.r*qloG+o.r*qloG+f.r*qfG+g.r*qgG+j.r*qjG+k.r*qkG)*rcpWG);
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
pixB=ASatF1((b.b*qbeG+e.b*qbeG+c.b*qchG+h.b*qchG+i.b*qinG+n.b*qinG+l.b*qloG+o.b*qloG+f.b*qfG+g.b*qgG+j.b*qjG+k.b*qkG)*rcpWG);
#else
#ifdef CAS_GO_SLOWER
AF1 rcpWR=ARcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
AF1 rcpWG=ARcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
AF1 rcpWB=ARcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
#else
AF1 rcpWR=APrxMedRcpF1(AF1_(2.0)*qbeR+AF1_(2.0)*qchR+AF1_(2.0)*qinR+AF1_(2.0)*qloR+qfR+qgR+qjR+qkR);
AF1 rcpWG=APrxMedRcpF1(AF1_(2.0)*qbeG+AF1_(2.0)*qchG+AF1_(2.0)*qinG+AF1_(2.0)*qloG+qfG+qgG+qjG+qkG);
AF1 rcpWB=APrxMedRcpF1(AF1_(2.0)*qbeB+AF1_(2.0)*qchB+AF1_(2.0)*qinB+AF1_(2.0)*qloB+qfB+qgB+qjB+qkB);
#endif
pixR=ASatF1((b.r*qbeR+e.r*qbeR+c.r*qchR+h.r*qchR+i.r*qinR+n.r*qinR+l.r*qloR+o.r*qloR+f.r*qfR+g.r*qgR+j.r*qjR+k.r*qkR)*rcpWR);
pixG=ASatF1((b.g*qbeG+e.g*qbeG+c.g*qchG+h.g*qchG+i.g*qinG+n.g*qinG+l.g*qloG+o.g*qloG+f.g*qfG+g.g*qgG+j.g*qjG+k.g*qkG)*rcpWG);
pixB=ASatF1((b.b*qbeB+e.b*qbeB+c.b*qchB+h.b*qchB+i.b*qinB+n.b*qinB+l.b*qloB+o.b*qloB+f.b*qfB+g.b*qgB+j.b*qjB+k.b*qkB)*rcpWB);
#endif
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================
#if defined(A_GPU) && defined(A_HALF)
// HLSL is missing a way to do packed re-interpretation, so the approximation optimizations must be
// disabled there: force CAS_GO_SLOWER on unless the includer has already defined it.
#if defined(A_HLSL)&&!defined(CAS_GO_SLOWER)
 #define CAS_GO_SLOWER 1
#endif
//==============================================================================================================================
// Converts packed SOA filter output (one AH2 per color channel) into two AOS pixels, ready for store.
//  pix0.rgb receives the .x lane of pixR/pixG/pixB.
//  pix1.rgb receives the .y lane of pixR/pixG/pixB.
// Alpha is written (as 0.0) only when A_HLSL is defined; otherwise this function does not write it.
void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
 // First pixel: gather the .x lanes.
 pix0.r=pixR.x;
 pix0.g=pixG.x;
 pix0.b=pixB.x;
 // Second pixel: gather the .y lanes.
 pix1.r=pixR.y;
 pix1.g=pixG.y;
 pix1.b=pixB.y;
 #ifdef A_HLSL
  // Slower path taken for DX only, since it won't allow uninitialized values in the outputs.
  pix1.a=0.0;
  pix0.a=0.0;
 #endif
}
//==============================================================================================================================
void CasFilterH(
// Output values are for 2 8x8 tiles in a 16x8 region.
// pix<R,G,B>.x = 8x8 tile at 'ip' (left)
// pix<R,G,B>.y = 8x8 tile at 'ip'+(8,0) (right)
// This enables later processing to easily be packed as well.
out AH2 pixR,
out AH2 pixG,
out AH2 pixB,
AU2 ip, // Integer pixel position in output.
AU4 const0, // Constants generated by CasSetup().
AU4 const1,
AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
//------------------------------------------------------------------------------------------------------------------------------
// Debug a checker pattern of on/off tiles for visual inspection.
#ifdef CAS_DEBUG_CHECKER
if((((ip.x^ip.y)>>8u)&1u)==0u){AH3 pix0=CasLoadH(ASW2(ip));AH3 pix1=CasLoadH(ASW2(ip)+ASW2(8,0));
pixR=AH2(pix0.r,pix1.r);pixG=AH2(pix0.g,pix1.g);pixB=AH2(pix0.b,pix1.b);CasInputH(pixR,pixG,pixB);return;}
#endif
//------------------------------------------------------------------------------------------------------------------------------
// No scaling algorithm uses minimal 3x3 pixel neighborhood.
if(noScaling){
ASW2 sp0=ASW2(ip);
AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
AH3 d0=CasLoadH(sp0+ASW2(-1, 0));
AH3 e0=CasLoadH(sp0);
AH3 f0=CasLoadH(sp0+ASW2( 1, 0));
AH3 g0=CasLoadH(sp0+ASW2(-1, 1));
AH3 h0=CasLoadH(sp0+ASW2( 0, 1));
AH3 i0=CasLoadH(sp0+ASW2( 1, 1));
ASW2 sp1=sp0+ASW2(8,0);
AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
AH3 d1=CasLoadH(sp1+ASW2(-1, 0));
AH3 e1=CasLoadH(sp1);
AH3 f1=CasLoadH(sp1+ASW2( 1, 0));
AH3 g1=CasLoadH(sp1+ASW2(-1, 1));
AH3 h1=CasLoadH(sp1+ASW2( 0, 1));
AH3 i1=CasLoadH(sp1+ASW2( 1, 1));
// AOS to SOA conversion.
AH2 aR=AH2(a0.r,a1.r);
AH2 aG=AH2(a0.g,a1.g);
AH2 aB=AH2(a0.b,a1.b);
AH2 bR=AH2(b0.r,b1.r);
AH2 bG=AH2(b0.g,b1.g);
AH2 bB=AH2(b0.b,b1.b);
AH2 cR=AH2(c0.r,c1.r);
AH2 cG=AH2(c0.g,c1.g);
AH2 cB=AH2(c0.b,c1.b);
AH2 dR=AH2(d0.r,d1.r);
AH2 dG=AH2(d0.g,d1.g);
AH2 dB=AH2(d0.b,d1.b);
AH2 eR=AH2(e0.r,e1.r);
AH2 eG=AH2(e0.g,e1.g);
AH2 eB=AH2(e0.b,e1.b);
AH2 fR=AH2(f0.r,f1.r);
AH2 fG=AH2(f0.g,f1.g);
AH2 fB=AH2(f0.b,f1.b);
AH2 gR=AH2(g0.r,g1.r);
AH2 gG=AH2(g0.g,g1.g);
AH2 gB=AH2(g0.b,g1.b);
AH2 hR=AH2(h0.r,h1.r);
AH2 hG=AH2(h0.g,h1.g);
AH2 hB=AH2(h0.b,h1.b);
AH2 iR=AH2(i0.r,i1.r);
AH2 iG=AH2(i0.g,i1.g);
AH2 iB=AH2(i0.b,i1.b);
// Run optional input transform.
CasInputH(aR,aG,aB);
CasInputH(bR,bG,bB);
CasInputH(cR,cG,cB);
CasInputH(dR,dG,dB);
CasInputH(eR,eG,eB);
CasInputH(fR,fG,fB);
CasInputH(gR,gG,gB);
CasInputH(hR,hG,hB);
CasInputH(iR,iG,iB);
// Soft min and max.
AH2 mnR=min(min(fR,hR),min(min(bR,dR),eR));
AH2 mnG=min(min(fG,hG),min(min(bG,dG),eG));
AH2 mnB=min(min(fB,hB),min(min(bB,dB),eB));
#ifdef CAS_BETTER_DIAGONALS
AH2 mnR2=min(min(gR,iR),min(min(aR,cR),mnR));
AH2 mnG2=min(min(gG,iG),min(min(aG,cG),mnG));
AH2 mnB2=min(min(gB,iB),min(min(aB,cB),mnB));
mnR=mnR+mnR2;
mnG=mnG+mnG2;
mnB=mnB+mnB2;
#endif
AH2 mxR=max(max(fR,hR),max(max(bR,dR),eR));
AH2 mxG=max(max(fG,hG),max(max(bG,dG),eG));
AH2 mxB=max(max(fB,hB),max(max(bB,dB),eB));
#ifdef CAS_BETTER_DIAGONALS
AH2 mxR2=max(max(gR,iR),max(max(aR,cR),mxR));
AH2 mxG2=max(max(gG,iG),max(max(aG,cG),mxG));