1 /**
2 * Implement the linear gradient fill style. dplug:canvas internals.
3 *
4 * Copyright: Copyright Chris Jones 2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.canvas.linearblit;
8 
9 import dplug.core.math;
10 
11 import dplug.canvas.rasterizer;
12 import dplug.canvas.gradient;
13 import dplug.canvas.misc;
14 
15 /*
16   linear gradient blit
17 */
18 
/// Linear gradient blitter. Fills horizontal pixel spans by projecting each
/// pixel's position onto the gradient axis and using the result to index the
/// gradient's color lookup table (LUT).
struct LinearBlit
{
nothrow:
@nogc:

    /// Initialize for a gradient running from (x0,y0) to (x1,y1).
    ///
    /// Params:
    ///     g  = gradient supplying the color LUT, must not be null
    ///     x0 = x coordinate of the gradient start point
    ///     y0 = y coordinate of the gradient start point
    ///     x1 = x coordinate of the gradient end point
    ///     y1 = y coordinate of the gradient end point
    void init(Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(g !is null);

        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hsq = w*w + h*h;    // squared length of the gradient axis
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero for degenerate gradients
        // Projection of a pixel offset onto the axis, scaled so that the far
        // end of the gradient maps to lutsize: these are the per-pixel LUT
        // index increments in x and y. (lutsize*w/hsq == (w/len)*(lutsize/len))
        xstep = lutsize * w / hsq; 
        ystep = lutsize * h / hsq;
    }

private:

    // Blit the span [x0,x1) of scanline y, compositing gradient colors over
    // dest using coverage integrated from delta.
    //   dest  = start of the destination row, one uint per pixel
    //   delta = winding deltas for this row; entries are consumed and zeroed
    //           here after integration
    //   mask  = per-quad bitmask scanned with nextSetBit to find the next
    //           4-pixel group containing non-zero deltas (presumably built by
    //           the rasterizer — see dplug.canvas.rasterizer)
    //   x0,x1 = span bounds in pixels, both multiples of 4 (asserted)
    //   y     = scanline, used for the gradient position
    // The winding rule wr selects how accumulated winding becomes coverage.
    void linear_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables; bpos/endbit index 4-pixel quads, xmWinding
        // carries the running winding count across quads in lane 3

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        assert(gradient.lutLength <= short.max); // LUT can be non-power-of-2 as far as LinearBlit is concerned, but this held low interest
        short lutMax = cast(short)(gradient.lutLength - 1);

        // NOTE(review): opaque fast path is disabled; was `gradient.isOpaque`.
        // Confirm Gradient exposes such a query before re-enabling.
        bool isopaque = false;

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables: t0 is the LUT position of the first pixel of the
        // span; xmT0 holds the positions of 4 consecutive pixels and is
        // advanced by xmStep0 (4*xstep) per quad

        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    // non-zero rule: coverage = min(abs(winding)*2, 0xFFFF)
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    // even-odd rule: wrap winding to 16 bits, mirror the
                    // negative half (xor with sign), then scale to 16-bit range
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // still advance the gradient position by the skipped quads
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color (opaque gradient, near-full
                // coverage: plain LUT copy, no blending)

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // truncate gradient positions to LUT indices, clamped
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;                        
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color (blend LUT colors over
                // dest at the constant coverage computed above)

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        // gradient positions -> clamped LUT indices
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        // unpack 4 destination pixels to 16-bit channels
                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        // fetch gradient colors for pixels 0,1; scale their
                        // alpha by the span coverage
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // same for pixels 2,3
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                       // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        // repack to 8-bit channels and store 4 pixels
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);
                        
                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values: prefix-sum the 4 lanes, add the
                // carried winding, keep lane 3 as the carry for the next quad,
                // and zero the consumed deltas

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw)); 
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw)); 
                tqw = _mm_add_epi32(tqw, xmWinding); 
                xmWinding = _mm_shuffle_epi32!255(tqw);  
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule
                
                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31); 
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);         // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);    // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16); 
                    tqw = _mm_srai_epi16(tcvr,15);         // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);        // fold negative half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO);   // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // spread the 4 per-pixel coverages so each pixel's 16-bit
                // channels see its own coverage value

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                // load grad colors for pixels 0,1; alpha scaled by coverage
                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                // and for pixels 2,3
                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);
                
                bpos++;
                ptr+=4;
                dlptr+=4;

                // if the next 4 delta entries are all zero, coverage is
                // constant again: return to the span-skipping fast path
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    Gradient gradient;      // source of the color LUT
    float xctr,yctr;        // gradient start point, set by init()
    float xstep,ystep;      // LUT index increment per pixel in x and y
}
285 
286 nothrow:
287 @nogc:
288 
/// Type-erased blit entry point: forwards to the non-zero winding rule
/// instantiation of LinearBlit.linear_blit. userData must point at an
/// initialized LinearBlit.
void doBlit_LinearBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    auto blitter = cast(LinearBlit*) userData;
    blitter.linear_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}
294 
/// Type-erased blit entry point: forwards to the even-odd winding rule
/// instantiation of LinearBlit.linear_blit. userData must point at an
/// initialized LinearBlit.
void doBlit_LinearBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    auto blitter = cast(LinearBlit*) userData;
    blitter.linear_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}