dplug.canvas.ellipticalblit source code

1 /**
2 * Implement the elliptic gradient fill style. dplug:canvas internals.
3 *
4 * Copyright: Copyright Chris Jones 2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.canvas.ellipticalblit;
8 
9 import dplug.canvas.rasterizer;
10 import dplug.canvas.gradient;
11 import dplug.canvas.misc;
12 
/**
 * Scanline blitter that fills covered pixels with an elliptical gradient.
 *
 * `init` stores the gradient and precomputes the per-pixel step factors.
 * `color_blit` is private and is reached through the module-level
 * `doBlit_EllipticalBlit_NonZero` / `doBlit_EllipticalBlit_EvenOdd`
 * functions, which receive a pointer to this struct as opaque user data.
 *
 * For a pixel at (x, y) two values are evaluated:
 *   t0 = (x - xctr) * xstep0 + (y - yctr) * ystep0
 *   t1 = (x - xctr) * xstep1 + (y - yctr) * ystep1
 * and the gradient lookup index is `sqrt(t0*t0 + t1*t1)`, truncated to an
 * integer and clamped with `_mm_clamp_0_to_N_epi32(.., lutLength - 1)`.
 */
struct EllipticalBlit
{
nothrow:
@nogc:

    /**
     * Prepare the blitter for an elliptical gradient.
     *
     * Params:
     *   g  = gradient to sample; must not be null (asserted). Its
     *        `lutLength` scales all step factors.
     *   x0 = x of the gradient centre (stored in `xctr`).
     *   y0 = y of the gradient centre (stored in `yctr`).
     *   x1 = x of the end point of the main axis.
     *   y1 = y of the end point of the main axis. With the steps computed
     *        below, t0 equals `lutLength` at (x1, y1) as long as the squared
     *        axis length is at least 1 (smaller values are clamped to 1).
     *   r2 = divisor applied to the steps of the perpendicular axis; t1
     *        reaches `lutLength` at a distance of r2 from the centre,
     *        measured perpendicular to the main axis.
     *        NOTE(review): r2 is not checked against zero here - confirm
     *        that callers never pass 0.
     */
    void init(Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        assert(g !is null);
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;

        // Squared length of the main axis, clamped to avoid dividing by
        // a (near) zero value.
        float hyp = w*w + h*h;
        if (hyp < 1.0) hyp = 1.0;

        // Main axis: projection onto (w,h), normalised by the squared length.
        xstep0 = lutsize * w / hyp;
        ystep0 = lutsize * h / hyp;

        // Perpendicular axis: projection onto (h,-w), normalised by the
        // axis length and by r2.
        hyp = sqrt(hyp);
        xstep1 = lutsize * h / (r2*hyp);
        ystep1 = lutsize * -w / (r2*hyp);
    }

private:

    /**
     * Blend the gradient into one scanline, 4 pixels at a time.
     *
     * Params:
     *   wr    = winding rule used to turn accumulated winding values into
     *           coverage.
     *   dest  = destination pixels of the scanline, indexed by x.
     *   delta = per-pixel winding deltas, indexed by x. Entries processed
     *           by the integration loop are reset to zero.
     *   mask  = bit mask handed to `nextSetBit` (defined elsewhere);
     *           presumably one bit per 4-pixel block whose deltas are
     *           non-zero - TODO confirm against the rasterizer.
     *   x0    = first pixel, must be >= 0 and a multiple of 4 (asserted).
     *   x1    = end pixel (exclusive), must be a multiple of 4 (asserted).
     *   y     = scanline row, must be >= 0 (asserted).
     */
    void color_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables (positions are counted in 4-pixel blocks)

        int bpos = x0 / 4;
        int endbit = x1 / 4;

        // Running winding value carried from one 4-pixel block to the next.
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        short lutMax = cast(short)(gradient.lutLength - 1);
        // Hard-wired to false, so the opaque fast path below is currently
        // never taken.
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables: t0 / t1 for the 4 pixels of the current block,
        // plus the increment needed to advance by one block (4 pixels).

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // Nothing to draw: just advance the gradient position
                    // by the number of skipped blocks.
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        rad = _mm_sqrt_ps(rad);
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        // Gradient colors overwrite the destination directly.
                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    // Same coverage for every pixel of the span.
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        rad = _mm_sqrt_ps(rad);

                        // Load 4 destination pixels, widened to 16 bits
                        // per channel (2 pixels per register).
                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        // Fetch gradient colors for pixels 0 and 1 and
                        // scale their alpha by the coverage.
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // Same for pixels 2 and 3.
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        // Narrow back to 8 bits per channel and store.
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                rad = _mm_sqrt_ps(rad);

                // Integrate delta values: prefix sum over the 4 deltas,
                // offset by the winding carried in from the previous block.
                // The last lane becomes the carry for the next block and
                // the consumed deltas are cleared.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32(rad);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);        // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);   // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);       // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);      // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);      // << to uint16
                }

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                // load grad colors
                // First replicate each pixel's 16-bit coverage across the
                // 4 channel lanes of that pixel: tcvr covers pixels 0-1,
                // tcvr2 covers pixels 2-3.

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Stop integrating once the next 4 deltas are all zero and
                // go back to the outer loop to look for a constant span.
                // NOTE(review): this reads the 4 deltas following the block
                // just processed, including when bpos == endbit - confirm
                // the delta buffer is padded accordingly.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    Gradient  gradient;        // gradient supplying the color lookup table
    float      xctr,yctr;      // gradient centre (x0, y0 passed to init)
    float      xstep0,ystep0;  // per-pixel steps of t0 (main axis)
    float      xstep1,ystep1;  // per-pixel steps of t1 (perpendicular axis)
}
300 
/**
 * Blit entry point for the non-zero winding rule.
 *
 * `userData` is reinterpreted as a pointer to an `EllipticalBlit`; all other
 * arguments are forwarded unchanged to its `color_blit` instantiated with
 * `WindingRule.NonZero`.
 */
void doBlit_EllipticalBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    auto blitter = cast(EllipticalBlit*) userData;
    blitter.color_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}
306 
/**
 * Blit entry point for the even-odd winding rule.
 *
 * `userData` is reinterpreted as a pointer to an `EllipticalBlit`; all other
 * arguments are forwarded unchanged to its `color_blit` instantiated with
 * `WindingRule.EvenOdd`.
 */
void doBlit_EllipticalBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    auto blitter = cast(EllipticalBlit*) userData;
    blitter.color_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}
312