/**
* Implement the linear gradient fill style. dplug:canvas internals.
*
* Copyright: Copyright Chris Jones 2020.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.canvas.linearblit;

import dplug.core.math;

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/*
  linear gradient blit

  Fills pixel spans with a linear gradient sampled from a precomputed
  colour lookup table (LUT), blending with the destination using
  coverage values integrated from the rasterizer's scan-delta buffer.
*/

struct LinearBlit
{
nothrow:
@nogc:

    /// Configure the blitter for a gradient running from (x0,y0) to (x1,y1).
    ///
    /// Params:
    ///     g  = the gradient whose LUT supplies the colours (must not be null)
    ///     x0 = gradient start x
    ///     y0 = gradient start y
    ///     x1 = gradient end x
    ///     y1 = gradient end y
    ///
    /// Precomputes (xstep, ystep): the per-pixel increment of the LUT
    /// position along x and y. The gradient parameter at pixel (x,y) is
    /// then (x-xctr)*xstep + (y-yctr)*ystep, i.e. the projection of the
    /// pixel onto the gradient axis, scaled to LUT index units.
    void init(Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(g !is null);

        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        // hsq = squared length of the gradient axis; dividing (w,h) by it
        // both normalizes the direction and scales by 1/length, so one
        // full axis length maps to the full LUT range.
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero (degenerate zero-length gradient)
        xstep = lutsize * w / hsq;
        ystep = lutsize * h / hsq;
    }

private:

    /// Blit one scanline segment [x0,x1) of row y with the linear gradient.
    ///
    /// Params:
    ///     wr    = winding rule (compile-time): NonZero or EvenOdd coverage folding
    ///     dest  = destination pixels, 32 bits each (presumably ARGB/RGBA 8:8:8:8
    ///             premultiplied-style blend — TODO confirm against rasterizer docs)
    ///     delta = scan delta buffer; prefix-summing it yields winding numbers.
    ///             Consumed destructively: processed entries are zeroed.
    ///     mask  = bitmask (DMWord array) with one bit per 4-pixel group marking
    ///             groups that contain nonzero deltas (see nextSetBit usage below)
    ///     x0    = first pixel (inclusive), must be >= 0 and a multiple of 4
    ///     x1    = last pixel (exclusive), must be a multiple of 4
    ///     y     = scanline, must be >= 0
    ///
    /// Pixels are processed 4 at a time with SSE. Between set mask bits the
    /// coverage is constant, so whole spans are skipped, solid-filled, or
    /// blended with a single coverage value.
    void linear_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);  // span edges are 4-pixel aligned so SIMD groups line up
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;      // current 4-pixel group index
        int endbit = x1 / 4;    // one past the last group
        __m128i xmWinding = 0;  // running winding sums for the current 4 pixels; lane 3 carries into the next group
        uint* lut = gradient.getLookup.ptr;
        assert(gradient.lutLength <= short.max); // LUT can be non-power-of-2 as far as LinearBlit is concerned, but this held low interest
        short lutMax = cast(short)(gradient.lutLength - 1);

        // Opaque fast path deliberately disabled; the solid-fill branch below
        // is currently dead code. (Original upstream note kept as-is.)
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        // t0 = LUT position of the first pixel of the first group on this row.
        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        // xmT0 holds the LUT positions of the current 4 pixels (t0, t0+xstep, ...).
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        // Advancing one 4-pixel group moves the position by 4*xstep in every lane.
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            // Next group that contains delta changes; groups in [bpos, nsb)
            // have constant coverage.
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    // NonZero: coverage is |winding|, scaled x2 to a 16-bit
                    // range and clamped.
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    // EvenOdd: wrap winding into 16 bits, then fold negative
                    // values positive via xor-with-sign trick, scaled x2.
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // Nothing to draw: just advance the gradient position by
                    // the number of skipped groups.
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (isopaque && (cover > 0xFF00))
                {
                    // Fully covered + opaque gradient: straight LUT copy, no blend.
                    // (Currently unreachable because isopaque is forced false above.)
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // Convert the 4 gradient positions to clamped LUT indices.
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    // Blend the whole span with one constant coverage value,
                    // broadcast to all 8 u16 lanes.
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        // Load 4 destination pixels, widen 8-bit channels to 16-bit.
                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        // Gather gradient colours 0..1, widen, and build the
                        // effective alpha = pixel alpha * coverage (mulhi keeps
                        // the product in 16-bit fixed point).
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // Same for gradient colours 2..3.
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        // Narrow back to 8-bit channels with unsigned saturation.
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values

                // Prefix-sum the 4 deltas within the group (two shift+add
                // steps), then add the winding carried in from the left.
                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                // Lane 3 (total so far) becomes the carry for the next group.
                xmWinding = _mm_shuffle_epi32!255(tqw);
                // Deltas are consumed: zero them for the next primitive.
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr); // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO); // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15); // mask
                    tcvr = _mm_xor_si128(tcvr,tqw); // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // load grad colors

                // Expand the 4 per-pixel 16-bit coverages so each pixel's
                // coverage fills a 64-bit half (one u16 per channel).
                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);


                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Once the next 8 delta entries are all zero, coverage is
                // constant again — return to the outer span loop.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    Gradient gradient;     // gradient supplying the colour LUT
    float xctr,yctr;       // gradient origin (x0,y0) from init()
    float xstep,ystep;     // LUT-index increment per pixel along x and y
}

nothrow:
@nogc:

/// C-style trampoline used by the rasterizer: blit with the NonZero winding rule.
/// userData must point at a LinearBlit configured via init().
void doBlit_LinearBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}

/// C-style trampoline used by the rasterizer: blit with the EvenOdd winding rule.
/// userData must point at a LinearBlit configured via init().
void doBlit_LinearBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}