MagickCore 7.1.1
Convert, Edit, Or Compose Bitmap Images
Loading...
Searching...
No Matches
accelerate-kernels-private.h
1/*
2 Copyright @ 2010 ImageMagick Studio LLC, a non-profit organization
3 dedicated to making software imaging solutions freely available.
4
5 You may not use this file except in compliance with the License. You may
6 obtain a copy of the License at
7
8 https://imagemagick.org/script/license.php
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15
16 MagickCore private kernels for accelerated functions.
17*/
18
19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21
22#if defined(__cplusplus) || defined(c_plusplus)
23extern "C" {
24#endif
25
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
27
28/*
29 Define declarations.
30*/
31#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE() "\n #""else " " \n"
34#define OPENCL_ENDIF() "\n #""endif " " \n"
35#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
37
38const char *accelerateKernels =
39
40/*
41 Define declarations.
42*/
43 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49 OPENCL_DEFINE(SigmaRandom, (attenuate))
50 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53 OPENCL_DEFINE(QuantumScale, (1.0/QuantumRange))
54
55/*
56 Typedef declarations.
57*/
58 STRINGIFY(
59 typedef enum
60 {
61 UndefinedColorspace,
62 CMYColorspace, /* negated linear RGB colorspace */
63 CMYKColorspace, /* CMY with Black separation */
64 GRAYColorspace, /* Single Channel greyscale (non-linear) image */
65 HCLColorspace,
66 HCLpColorspace,
67 HSBColorspace,
68 HSIColorspace,
69 HSLColorspace,
70 HSVColorspace, /* alias for HSB */
71 HWBColorspace,
72 LabColorspace,
73 LCHColorspace, /* alias for LCHuv */
74 LCHabColorspace, /* Cylindrical (Polar) Lab */
75 LCHuvColorspace, /* Cylindrical (Polar) Luv */
76 LogColorspace,
77 LMSColorspace,
78 LuvColorspace,
79 OHTAColorspace,
80 Rec601YCbCrColorspace,
81 Rec709YCbCrColorspace,
82 RGBColorspace, /* Linear RGB colorspace */
83 scRGBColorspace, /* ??? */
84 sRGBColorspace, /* Default: non-linear sRGB colorspace */
85 TransparentColorspace,
86 xyYColorspace,
87 XYZColorspace, /* IEEE Color Reference colorspace */
88 YCbCrColorspace,
89 YCCColorspace,
90 YDbDrColorspace,
91 YIQColorspace,
92 YPbPrColorspace,
93 YUVColorspace,
94 LinearGRAYColorspace /* Single Channel greyscale (linear) image */
95 } ColorspaceType;
96 )
97
98 STRINGIFY(
99 typedef enum
100 {
101 UndefinedCompositeOp,
102 AlphaCompositeOp,
103 AtopCompositeOp,
104 BlendCompositeOp,
105 BlurCompositeOp,
106 BumpmapCompositeOp,
107 ChangeMaskCompositeOp,
108 ClearCompositeOp,
109 ColorBurnCompositeOp,
110 ColorDodgeCompositeOp,
111 ColorizeCompositeOp,
112 CopyBlackCompositeOp,
113 CopyBlueCompositeOp,
114 CopyCompositeOp,
115 CopyCyanCompositeOp,
116 CopyGreenCompositeOp,
117 CopyMagentaCompositeOp,
118 CopyAlphaCompositeOp,
119 CopyRedCompositeOp,
120 CopyYellowCompositeOp,
121 DarkenCompositeOp,
122 DarkenIntensityCompositeOp,
123 DifferenceCompositeOp,
124 DisplaceCompositeOp,
125 DissolveCompositeOp,
126 DistortCompositeOp,
127 DivideDstCompositeOp,
128 DivideSrcCompositeOp,
129 DstAtopCompositeOp,
130 DstCompositeOp,
131 DstInCompositeOp,
132 DstOutCompositeOp,
133 DstOverCompositeOp,
134 ExclusionCompositeOp,
135 HardLightCompositeOp,
136 HardMixCompositeOp,
137 HueCompositeOp,
138 InCompositeOp,
139 IntensityCompositeOp,
140 LightenCompositeOp,
141 LightenIntensityCompositeOp,
142 LinearBurnCompositeOp,
143 LinearDodgeCompositeOp,
144 LinearLightCompositeOp,
145 LuminizeCompositeOp,
146 MathematicsCompositeOp,
147 MinusDstCompositeOp,
148 MinusSrcCompositeOp,
149 ModulateCompositeOp,
150 ModulusAddCompositeOp,
151 ModulusSubtractCompositeOp,
152 MultiplyCompositeOp,
153 NoCompositeOp,
154 OutCompositeOp,
155 OverCompositeOp,
156 OverlayCompositeOp,
157 PegtopLightCompositeOp,
158 PinLightCompositeOp,
159 PlusCompositeOp,
160 ReplaceCompositeOp,
161 SaturateCompositeOp,
162 ScreenCompositeOp,
163 SoftLightCompositeOp,
164 SrcAtopCompositeOp,
165 SrcCompositeOp,
166 SrcInCompositeOp,
167 SrcOutCompositeOp,
168 SrcOverCompositeOp,
169 ThresholdCompositeOp,
170 VividLightCompositeOp,
171 XorCompositeOp,
172 StereoCompositeOp
173 } CompositeOperator;
174 )
175
176 STRINGIFY(
177 typedef enum
178 {
179 UndefinedFunction,
180 ArcsinFunction,
181 ArctanFunction,
182 PolynomialFunction,
183 SinusoidFunction
184 } MagickFunction;
185 )
186
187 STRINGIFY(
188 typedef enum
189 {
190 UndefinedNoise,
191 UniformNoise,
192 GaussianNoise,
193 MultiplicativeGaussianNoise,
194 ImpulseNoise,
195 LaplacianNoise,
196 PoissonNoise,
197 RandomNoise
198 } NoiseType;
199 )
200
201 STRINGIFY(
202 typedef enum
203 {
204 UndefinedPixelIntensityMethod = 0,
205 AveragePixelIntensityMethod,
206 BrightnessPixelIntensityMethod,
207 LightnessPixelIntensityMethod,
208 MSPixelIntensityMethod,
209 Rec601LumaPixelIntensityMethod,
210 Rec601LuminancePixelIntensityMethod,
211 Rec709LumaPixelIntensityMethod,
212 Rec709LuminancePixelIntensityMethod,
213 RMSPixelIntensityMethod
214 } PixelIntensityMethod;
215 )
216
217 STRINGIFY(
218 typedef enum
219 {
220 BoxWeightingFunction = 0,
221 TriangleWeightingFunction,
222 CubicBCWeightingFunction,
223 HannWeightingFunction,
224 HammingWeightingFunction,
225 BlackmanWeightingFunction,
226 GaussianWeightingFunction,
227 QuadraticWeightingFunction,
228 JincWeightingFunction,
229 SincWeightingFunction,
230 SincFastWeightingFunction,
231 KaiserWeightingFunction,
232 WelchWeightingFunction,
233 BohmanWeightingFunction,
234 LagrangeWeightingFunction,
235 CosineWeightingFunction
236 } ResizeWeightingFunctionType;
237 )
238
239 STRINGIFY(
240 typedef enum
241 {
242 UndefinedChannel = 0x0000,
243 RedChannel = 0x0001,
244 GrayChannel = 0x0001,
245 CyanChannel = 0x0001,
246 GreenChannel = 0x0002,
247 MagentaChannel = 0x0002,
248 BlueChannel = 0x0004,
249 YellowChannel = 0x0004,
250 BlackChannel = 0x0008,
251 AlphaChannel = 0x0010,
252 OpacityChannel = 0x0010,
253 IndexChannel = 0x0020, /* Color Index Table? */
254 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */
255 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */
256 MetaChannel = 0x0100, /* ???? */
257 CompositeChannels = 0x001F,
258 AllChannels = 0x7ffffff, /* 0x7FFFFFFFFFFFFFFF for 64-bit channel masks */
259 /*
260 Special purpose channel types.
261 FUTURE: are these needed any more - they are more like hacks
262 SyncChannels for example is NOT a real channel but a 'flag'
263 It really says -- "User has not defined channels"
264 Though it does have extra meaning in the "-auto-level" operator
265 */
266 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
267 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */
268 GrayChannels = 0x0400,
269 SyncChannels = 0x20000, /* channels modified as a single unit */
270 DefaultChannels = AllChannels
271 } ChannelType; /* must correspond to PixelChannel */
272 )
273
274/*
275 Helper functions.
276*/
277
278OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
279
280 STRINGIFY(
281 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
282 {
283 return((CLQuantum) value);
284 }
285 )
286
287OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
288
289 STRINGIFY(
290 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
291 {
292 return((CLQuantum) (257.0f*value));
293 }
294 )
295
296OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
297
298 STRINGIFY(
299 static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
300 {
301 return((CLQuantum) (16843009.0*value));
302 }
303 )
304
305OPENCL_ENDIF()
306
307OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
308
309 STRINGIFY(
310 static inline CLQuantum ClampToQuantum(const float value)
311 {
312 return (CLQuantum) clamp(value, 0.0f, QuantumRange);
313 }
314 )
315
316OPENCL_ELSE()
317
318 STRINGIFY(
319 static inline CLQuantum ClampToQuantum(const float value)
320 {
321 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
322 }
323 )
324
325OPENCL_ENDIF()
326
327 STRINGIFY(
328 static inline int ClampToCanvas(const int offset,const int range)
329 {
330 return clamp(offset, (int)0, range-1);
331 }
332 )
333
334 STRINGIFY(
335 static inline uint ScaleQuantumToMap(CLQuantum value)
336 {
337 if (value >= (CLQuantum) MaxMap)
338 return ((uint)MaxMap);
339 else
340 return ((uint)value);
341 }
342 )
343
344 STRINGIFY(
345 static inline float PerceptibleReciprocal(const float x)
346 {
347 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
348 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
349 }
350 )
351
352 STRINGIFY(
353
354 static inline unsigned int getPixelIndex(const unsigned int number_channels,
355 const unsigned int columns, const unsigned int x, const unsigned int y)
356 {
357 return (x * number_channels) + (y * columns * number_channels);
358 }
359
360 static inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; }
361 static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
362 static inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); }
363 static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
364
365 static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; }
366 static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
367 static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; }
368 static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
369
370 static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
371 static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
372 static inline float getBlueF4(float4 p) { return p.x; }
373 static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
374
375 static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
376 static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
377 static inline float getGreenF4(float4 p) { return p.y; }
378 static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
379
380 static inline CLQuantum getRed(CLPixelType p) { return p.z; }
381 static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
382 static inline float getRedF4(float4 p) { return p.z; }
383 static inline void setRedF4(float4* p, float value) { (*p).z = value; }
384
385 static inline CLQuantum getAlpha(CLPixelType p) { return p.w; }
386 static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
387 static inline float getAlphaF4(float4 p) { return p.w; }
388 static inline void setAlphaF4(float4* p, float value) { (*p).w = value; }
389
390 static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
391 const ChannelType channel, float *red, float *green, float *blue, float *alpha)
392 {
393 if ((channel & RedChannel) != 0)
394 *red=getPixelRed(p);
395
396 if (number_channels > 2)
397 {
398 if ((channel & GreenChannel) != 0)
399 *green=getPixelGreen(p);
400
401 if ((channel & BlueChannel) != 0)
402 *blue=getPixelBlue(p);
403 }
404
405 if (((number_channels == 4) || (number_channels == 2)) &&
406 ((channel & AlphaChannel) != 0))
407 *alpha=getPixelAlpha(p,number_channels);
408 }
409
410 static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
411 const unsigned int columns, const unsigned int x, const unsigned int y)
412 {
413 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
414
415 float4 pixel;
416
417 pixel.x=getPixelRed(p);
418
419 if (number_channels > 2)
420 {
421 pixel.y=getPixelGreen(p);
422 pixel.z=getPixelBlue(p);
423 }
424
425 if ((number_channels == 4) || (number_channels == 2))
426 pixel.w=getPixelAlpha(p,number_channels);
427 return(pixel);
428 }
429
430 static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
431 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
432 {
433 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
434
435 float red = 0.0f;
436 float green = 0.0f;
437 float blue = 0.0f;
438 float alpha = 0.0f;
439
440 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
441 return (float4)(red, green, blue, alpha);
442 }
443
444 static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
445 const ChannelType channel, float red, float green, float blue, float alpha)
446 {
447 if ((channel & RedChannel) != 0)
448 setPixelRed(p,ClampToQuantum(red));
449
450 if (number_channels > 2)
451 {
452 if ((channel & GreenChannel) != 0)
453 setPixelGreen(p,ClampToQuantum(green));
454
455 if ((channel & BlueChannel) != 0)
456 setPixelBlue(p,ClampToQuantum(blue));
457 }
458
459 if (((number_channels == 4) || (number_channels == 2)) &&
460 ((channel & AlphaChannel) != 0))
461 setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
462 }
463
464 static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
465 const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
466 {
467 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
468
469 setPixelRed(p,ClampToQuantum(pixel.x));
470
471 if (number_channels > 2)
472 {
473 setPixelGreen(p,ClampToQuantum(pixel.y));
474 setPixelBlue(p,ClampToQuantum(pixel.z));
475 }
476
477 if ((number_channels == 4) || (number_channels == 2))
478 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
479 }
480
481 static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
482 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
483 float4 pixel)
484 {
485 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
486 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
487 }
488
489 static inline float GetPixelIntensity(const unsigned int colorspace,
490 const unsigned int method,float red,float green,float blue)
491 {
492 float intensity;
493
494 if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
495 return red;
496
497 switch (method)
498 {
499 case AveragePixelIntensityMethod:
500 {
501 intensity=(red+green+blue)/3.0;
502 break;
503 }
504 case BrightnessPixelIntensityMethod:
505 {
506 intensity=MagickMax(MagickMax(red,green),blue);
507 break;
508 }
509 case LightnessPixelIntensityMethod:
510 {
511 intensity=(MagickMin(MagickMin(red,green),blue)+
512 MagickMax(MagickMax(red,green),blue))/2.0;
513 break;
514 }
515 case MSPixelIntensityMethod:
516 {
517 intensity=(float) (((float) red*red+green*green+blue*blue)/
518 (3.0*QuantumRange));
519 break;
520 }
521 case Rec601LumaPixelIntensityMethod:
522 {
523 /*
524 if (image->colorspace == RGBColorspace)
525 {
526 red=EncodePixelGamma(red);
527 green=EncodePixelGamma(green);
528 blue=EncodePixelGamma(blue);
529 }
530 */
531 intensity=0.298839*red+0.586811*green+0.114350*blue;
532 break;
533 }
534 case Rec601LuminancePixelIntensityMethod:
535 {
536 /*
537 if (image->colorspace == sRGBColorspace)
538 {
539 red=DecodePixelGamma(red);
540 green=DecodePixelGamma(green);
541 blue=DecodePixelGamma(blue);
542 }
543 */
544 intensity=0.298839*red+0.586811*green+0.114350*blue;
545 break;
546 }
547 case Rec709LumaPixelIntensityMethod:
548 default:
549 {
550 /*
551 if (image->colorspace == RGBColorspace)
552 {
553 red=EncodePixelGamma(red);
554 green=EncodePixelGamma(green);
555 blue=EncodePixelGamma(blue);
556 }
557 */
558 intensity=0.212656*red+0.715158*green+0.072186*blue;
559 break;
560 }
561 case Rec709LuminancePixelIntensityMethod:
562 {
563 /*
564 if (image->colorspace == sRGBColorspace)
565 {
566 red=DecodePixelGamma(red);
567 green=DecodePixelGamma(green);
568 blue=DecodePixelGamma(blue);
569 }
570 */
571 intensity=0.212656*red+0.715158*green+0.072186*blue;
572 break;
573 }
574 case RMSPixelIntensityMethod:
575 {
576 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
577 sqrt(3.0));
578 break;
579 }
580 }
581
582 return intensity;
583 }
584
585 static inline int mirrorBottom(int value)
586 {
587 return (value < 0) ? - (value) : value;
588 }
589
590 static inline int mirrorTop(int value, int width)
591 {
592 return (value >= width) ? (2 * width - value - 1) : value;
593 }
594 )
595
596/*
597%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
598% %
599% %
600% %
601% A d d N o i s e %
602% %
603% %
604% %
605%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
606*/
607
608 STRINGIFY(
609 /*
610 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
611 This is provided under BSD, full license is with the main package.
612 See http://www.doc.ic.ac.uk/~dt10/research
613 */
614
615 // Pre: a<M, b<M
616 // Post: r=(a+b) mod M
617 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
618 {
619 ulong v=a+b;
620 //if( (v>=M) || (v<a) )
621 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
622 v=v-M;
623 return v;
624 }
625
626 // Pre: a<M,b<M
627 // Post: r=(a*b) mod M
628 // This could be done more efficiently, but it is portable, and should
629 // be easy to understand. It can be replaced with any of the better
630 // modular multiplication algorithms (for example if you know you have
631 // double precision available or something).
632 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
633 {
634 ulong r=0;
635 while(a!=0){
636 if(a&1)
637 r=MWC_AddMod64(r,b,M);
638 b=MWC_AddMod64(b,b,M);
639 a=a>>1;
640 }
641 return r;
642 }
643
644 // Pre: a<M, e>=0
645 // Post: r=(a^b) mod M
646 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
647 // most architectures
648 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
649 {
650 ulong sqr=a, acc=1;
651 while(e!=0){
652 if(e&1)
653 acc=MWC_MulMod64(acc,sqr,M);
654 sqr=MWC_MulMod64(sqr,sqr,M);
655 e=e>>1;
656 }
657 return acc;
658 }
659
660 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
661 {
662 ulong m=MWC_PowMod64(A, distance, M);
663 ulong x=curr.x*(ulong)A+curr.y;
664 x=MWC_MulMod64(x, m, M);
665 return (uint2)((uint)(x/A), (uint)(x%A));
666 }
667
668 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
669 {
670 // This is an arbitrary constant for starting LCG jumping from. I didn't
671 // want to start from 1, as then you end up with the two or three first values
672 // being a bit poor in ones - once you've decided that, one constant is as
673 // good as any another. There is no deep mathematical reason for it, I just
674 // generated a random number.
675 enum{ MWC_BASEID = 4077358422479273989UL };
676
677 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
678 ulong m=MWC_PowMod64(A, dist, M);
679
680 ulong x=MWC_MulMod64(MWC_BASEID, m, M);
681 return (uint2)((uint)(x/A), (uint)(x%A));
682 }
683
685 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
686
687 void MWC64X_Step(mwc64x_state_t *s)
688 {
689 uint X=s->x, C=s->c;
690
691 uint Xn=s->seed0*X+C;
692 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
693 uint Cn=mad_hi(s->seed0,X,carry);
694
695 s->x=Xn;
696 s->c=Cn;
697 }
698
699 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
700 {
701 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
702 s->x=tmp.x;
703 s->c=tmp.y;
704 }
705
706 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
707 {
708 uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
709 s->x=tmp.x;
710 s->c=tmp.y;
711 }
712
714 uint MWC64X_NextUint(mwc64x_state_t *s)
715 {
716 uint res=s->x ^ s->c;
717 MWC64X_Step(s);
718 return res;
719 }
720
721 //
722 // End of MWC64X excerpt
723 //
724
725 float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
726 {
727 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
728 }
729
730 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
731 {
732 float
733 alpha,
734 beta,
735 noise,
736 sigma;
737
738 noise = 0.0f;
739 alpha=mwcReadPseudoRandomValue(r);
740 switch(noise_type)
741 {
742 case UniformNoise:
743 default:
744 {
745 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
746 break;
747 }
748 case GaussianNoise:
749 {
750 float
751 gamma,
752 tau;
753
754 if (alpha == 0.0f)
755 alpha=1.0f;
756 beta=mwcReadPseudoRandomValue(r);
757 gamma=sqrt(-2.0f*log(alpha));
758 sigma=gamma*cospi((2.0f*beta));
759 tau=gamma*sinpi((2.0f*beta));
760 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
761 break;
762 }
763 case ImpulseNoise:
764 {
765 if (alpha < (SigmaImpulse/2.0f))
766 noise=0.0f;
767 else
768 if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
769 noise=QuantumRange;
770 else
771 noise=pixel;
772 break;
773 }
774 case LaplacianNoise:
775 {
776 if (alpha <= 0.5f)
777 {
778 if (alpha <= MagickEpsilon)
779 noise=(pixel-QuantumRange);
780 else
781 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
782 0.5f);
783 break;
784 }
785 beta=1.0f-alpha;
786 if (beta <= (0.5f*MagickEpsilon))
787 noise=(pixel+QuantumRange);
788 else
789 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
790 break;
791 }
792 case MultiplicativeGaussianNoise:
793 {
794 sigma=1.0f;
795 if (alpha > MagickEpsilon)
796 sigma=sqrt(-2.0f*log(alpha));
797 beta=mwcReadPseudoRandomValue(r);
798 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
799 cospi((2.0f*beta))/2.0f);
800 break;
801 }
802 case PoissonNoise:
803 {
804 float
805 poisson;
806 unsigned int i;
807 poisson=exp(-SigmaPoisson*QuantumScale*pixel);
808 for (i=0; alpha > poisson; i++)
809 {
810 beta=mwcReadPseudoRandomValue(r);
811 alpha*=beta;
812 }
813 noise=(QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
814 break;
815 }
816 case RandomNoise:
817 {
818 noise=(QuantumRange*SigmaRandom*alpha);
819 break;
820 }
821 }
822 return noise;
823 }
824 )
825
826/*
827%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
828% %
829% %
830% %
831% B l u r %
832% %
833% %
834% %
835%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
836*/
837
838 STRINGIFY(
839 /*
840 Reduce image noise and reduce detail levels by row
841 */
842 __kernel void BlurRow(const __global CLQuantum *image,
843 const unsigned int number_channels,const ChannelType channel,
844 __constant float *filter,const unsigned int width,
845 const unsigned int imageColumns,const unsigned int imageRows,
846 __local float4 *temp,__global float4 *tempImage)
847 {
848 const int x = get_global_id(0);
849 const int y = get_global_id(1);
850
851 const int columns = imageColumns;
852
853 const unsigned int radius = (width-1)/2;
854 const int wsize = get_local_size(0);
855 const unsigned int loadSize = wsize+width;
856
857 //group coordinate
858 const int groupX=get_local_size(0)*get_group_id(0);
859
860 //parallel load and clamp
861 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
862 {
863 int cx = ClampToCanvas(i + groupX - radius, columns);
864 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
865 }
866
867 // barrier
868 barrier(CLK_LOCAL_MEM_FENCE);
869
870 // only do the work if this is not a patched item
871 if (get_global_id(0) < columns)
872 {
873 // compute
874 float4 result = (float4) 0;
875
876 int i = 0;
877
878 for ( ; i+7 < width; )
879 {
880 for (int j=0; j < 8; j++)
881 result+=filter[i+j]*temp[i+j+get_local_id(0)];
882 i+=8;
883 }
884
885 for ( ; i < width; i++)
886 result+=filter[i]*temp[i+get_local_id(0)];
887
888 // write back to global
889 tempImage[y*columns+x] = result;
890 }
891 }
892 )
893
894 STRINGIFY(
895 /*
896 Reduce image noise and reduce detail levels by line
897 */
898 __kernel void BlurColumn(const __global float4 *blurRowData,
899 const unsigned int number_channels,const ChannelType channel,
900 __constant float *filter,const unsigned int width,
901 const unsigned int imageColumns,const unsigned int imageRows,
902 __local float4 *temp,__global CLQuantum *filteredImage)
903 {
904 const int x = get_global_id(0);
905 const int y = get_global_id(1);
906
907 const int columns = imageColumns;
908 const int rows = imageRows;
909
910 unsigned int radius = (width-1)/2;
911 const int wsize = get_local_size(1);
912 const unsigned int loadSize = wsize+width;
913
914 //group coordinate
915 const int groupX=get_local_size(0)*get_group_id(0);
916 const int groupY=get_local_size(1)*get_group_id(1);
917 //notice that get_local_size(0) is 1, so
918 //groupX=get_group_id(0);
919
920 //parallel load and clamp
921 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
922 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
923
924 // barrier
925 barrier(CLK_LOCAL_MEM_FENCE);
926
927 // only do the work if this is not a patched item
928 if (get_global_id(1) < rows)
929 {
930 // compute
931 float4 result = (float4) 0;
932
933 int i = 0;
934
935 for ( ; i+7 < width; )
936 {
937 for (int j=0; j < 8; j++)
938 result+=filter[i+j]*temp[i+j+get_local_id(1)];
939 i+=8;
940 }
941
942 for ( ; i < width; i++)
943 result+=filter[i]*temp[i+get_local_id(1)];
944
945 // write back to global
946 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
947 }
948 }
949 )
950
951/*
952%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
953% %
954% %
955% %
956% C o n t r a s t %
957% %
958% %
959% %
960%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
961*/
962
963 STRINGIFY(
964
965 static inline float4 ConvertRGBToHSB(const float4 pixel)
966 {
967 float4 result=0.0f;
968 result.w=pixel.w;
969 float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
970 if (tmax != 0.0f)
971 {
972 float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
973 float delta=tmax-tmin;
974
975 result.y=delta/tmax;
976 result.z=QuantumScale*tmax;
977 if (delta != 0.0f)
978 {
979 result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
980 result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
981 (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
982 result.x/=6.0f;
983 result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
984 }
985 }
986 return(result);
987 }
988
989 static inline float4 ConvertHSBToRGB(const float4 pixel)
990 {
991 float hue=pixel.x;
992 float saturation=pixel.y;
993 float brightness=pixel.z;
994
995 float4 result=pixel;
996
997 if (saturation == 0.0f)
998 {
999 result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1000 }
1001 else
1002 {
1003 float h=6.0f*(hue-floor(hue));
1004 float f=h-floor(h);
1005 float p=brightness*(1.0f-saturation);
1006 float q=brightness*(1.0f-saturation*f);
1007 float t=brightness*(1.0f-(saturation*(1.0f-f)));
1008 int ih = (int)h;
1009
1010 if (ih == 1)
1011 {
1012 result.x=ClampToQuantum(QuantumRange*q);
1013 result.y=ClampToQuantum(QuantumRange*brightness);
1014 result.z=ClampToQuantum(QuantumRange*p);
1015 }
1016 else if (ih == 2)
1017 {
1018 result.x=ClampToQuantum(QuantumRange*p);
1019 result.y=ClampToQuantum(QuantumRange*brightness);
1020 result.z=ClampToQuantum(QuantumRange*t);
1021 }
1022 else if (ih == 3)
1023 {
1024 result.x=ClampToQuantum(QuantumRange*p);
1025 result.y=ClampToQuantum(QuantumRange*q);
1026 result.z=ClampToQuantum(QuantumRange*brightness);
1027 }
1028 else if (ih == 4)
1029 {
1030 result.x=ClampToQuantum(QuantumRange*t);
1031 result.y=ClampToQuantum(QuantumRange*p);
1032 result.z=ClampToQuantum(QuantumRange*brightness);
1033 }
1034 else if (ih == 5)
1035 {
1036 result.x=ClampToQuantum(QuantumRange*brightness);
1037 result.y=ClampToQuantum(QuantumRange*p);
1038 result.z=ClampToQuantum(QuantumRange*q);
1039 }
1040 else
1041 {
1042 result.x=ClampToQuantum(QuantumRange*brightness);
1043 result.y=ClampToQuantum(QuantumRange*t);
1044 result.z=ClampToQuantum(QuantumRange*p);
1045 }
1046 }
1047 return(result);
1048 }
1049
1050 __kernel void Contrast(__global CLQuantum *image,
1051 const unsigned int number_channels,const int sign)
1052 {
1053 const int x=get_global_id(0);
1054 const int y=get_global_id(1);
1055 const unsigned int columns=get_global_size(0);
1056
1057 float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1058 if (number_channels < 3)
1059 pixel.y=pixel.z=pixel.x;
1060
1061 pixel=ConvertRGBToHSB(pixel);
1062 float brightness=pixel.z;
1063 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1064 brightness=clamp(brightness,0.0f,1.0f);
1065 pixel.z=brightness;
1066 pixel=ConvertHSBToRGB(pixel);
1067
1068 WriteAllChannels(image,number_channels,columns,x,y,pixel);
1069 }
1070 )
1071
1072/*
1073%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1074% %
1075% %
1076% %
1077% C o n t r a s t S t r e t c h %
1078% %
1079% %
1080% %
1081%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1082*/
1083
1084 STRINGIFY(
1085 /*
1086 */
1087 __kernel void Histogram(__global CLPixelType * restrict im,
1088 const ChannelType channel,
1089 const unsigned int colorspace,
1090 const unsigned int method,
1091 __global uint4 * restrict histogram)
1092 {
1093 const int x = get_global_id(0);
1094 const int y = get_global_id(1);
1095 const int columns = get_global_size(0);
1096 const int c = x + y * columns;
1097 if ((channel & SyncChannels) != 0)
1098 {
1099 float red=(float)getRed(im[c]);
1100 float green=(float)getGreen(im[c]);
1101 float blue=(float)getBlue(im[c]);
1102
1103 float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1104 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1105 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1106 }
1107 else
1108 {
1109 // for equalizing, we always need all channels?
1110 // otherwise something more
1111 }
1112 }
1113 )
1114
1115 STRINGIFY(
1116 /*
1117 */
1118 __kernel void ContrastStretch(__global CLPixelType * restrict im,
1119 const ChannelType channel,
1120 __global CLPixelType * restrict stretch_map,
1121 const float4 white, const float4 black)
1122 {
1123 const int x = get_global_id(0);
1124 const int y = get_global_id(1);
1125 const int columns = get_global_size(0);
1126 const int c = x + y * columns;
1127
1128 uint ePos;
1129 CLPixelType oValue, eValue;
1130 CLQuantum red, green, blue, alpha;
1131
1132 //read from global
1133 oValue=im[c];
1134
1135 if ((channel & RedChannel) != 0)
1136 {
1137 if (getRedF4(white) != getRedF4(black))
1138 {
1139 ePos = ScaleQuantumToMap(getRed(oValue));
1140 eValue = stretch_map[ePos];
1141 red = getRed(eValue);
1142 }
1143 }
1144
1145 if ((channel & GreenChannel) != 0)
1146 {
1147 if (getGreenF4(white) != getGreenF4(black))
1148 {
1149 ePos = ScaleQuantumToMap(getGreen(oValue));
1150 eValue = stretch_map[ePos];
1151 green = getGreen(eValue);
1152 }
1153 }
1154
1155 if ((channel & BlueChannel) != 0)
1156 {
1157 if (getBlueF4(white) != getBlueF4(black))
1158 {
1159 ePos = ScaleQuantumToMap(getBlue(oValue));
1160 eValue = stretch_map[ePos];
1161 blue = getBlue(eValue);
1162 }
1163 }
1164
1165 if ((channel & AlphaChannel) != 0)
1166 {
1167 if (getAlphaF4(white) != getAlphaF4(black))
1168 {
1169 ePos = ScaleQuantumToMap(getAlpha(oValue));
1170 eValue = stretch_map[ePos];
1171 alpha = getAlpha(eValue);
1172 }
1173 }
1174
1175 //write back
1176 im[c]=(CLPixelType)(blue, green, red, alpha);
1177
1178 }
1179 )
1180/*
1181%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1182% %
1183% %
1184% %
1185% D e s p e c k l e %
1186% %
1187% %
1188% %
1189%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1190*/
1191
1192 STRINGIFY(
1193
1194 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1195 , const unsigned int imageWidth, const unsigned int imageHeight
1196 , const int2 offset, const int polarity, const int matte) {
1197
1198 int x = get_global_id(0);
1199 int y = get_global_id(1);
1200
1201 CLPixelType v = inputImage[y*imageWidth+x];
1202
1203 int2 neighbor;
1204 neighbor.y = y + offset.y;
1205 neighbor.x = x + offset.x;
1206
1207 int2 clampedNeighbor;
1208 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1209 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1210
1211 CLPixelType r = (clampedNeighbor.x == neighbor.x
1212 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1213 :(CLPixelType)0;
1214
1215 int sv[4];
1216 sv[0] = (int)v.x;
1217 sv[1] = (int)v.y;
1218 sv[2] = (int)v.z;
1219 sv[3] = (int)v.w;
1220
1221 int sr[4];
1222 sr[0] = (int)r.x;
1223 sr[1] = (int)r.y;
1224 sr[2] = (int)r.z;
1225 sr[3] = (int)r.w;
1226
1227 if (polarity > 0) {
1228 \n #pragma unroll 4\n
1229 for (unsigned int i = 0; i < 4; i++) {
1230 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1231 }
1232 }
1233 else {
1234 \n #pragma unroll 4\n
1235 for (unsigned int i = 0; i < 4; i++) {
1236 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1237 }
1238
1239 }
1240
1241 v.x = (CLQuantum)sv[0];
1242 v.y = (CLQuantum)sv[1];
1243 v.z = (CLQuantum)sv[2];
1244
1245 if (matte!=0)
1246 v.w = (CLQuantum)sv[3];
1247
1248 outputImage[y*imageWidth+x] = v;
1249
1250 }
1251
1252
1253 )
1254
1255 STRINGIFY(
1256
1257 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1258 , const unsigned int imageWidth, const unsigned int imageHeight
1259 , const int2 offset, const int polarity, const int matte) {
1260
1261 int x = get_global_id(0);
1262 int y = get_global_id(1);
1263
1264 CLPixelType v = inputImage[y*imageWidth+x];
1265
1266 int2 neighbor, clampedNeighbor;
1267
1268 neighbor.y = y + offset.y;
1269 neighbor.x = x + offset.x;
1270 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1271 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1272
1273 CLPixelType r = (clampedNeighbor.x == neighbor.x
1274 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1275 :(CLPixelType)0;
1276
1277
1278 neighbor.y = y - offset.y;
1279 neighbor.x = x - offset.x;
1280 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1281 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1282
1283 CLPixelType s = (clampedNeighbor.x == neighbor.x
1284 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1285 :(CLPixelType)0;
1286
1287
1288 int sv[4];
1289 sv[0] = (int)v.x;
1290 sv[1] = (int)v.y;
1291 sv[2] = (int)v.z;
1292 sv[3] = (int)v.w;
1293
1294 int sr[4];
1295 sr[0] = (int)r.x;
1296 sr[1] = (int)r.y;
1297 sr[2] = (int)r.z;
1298 sr[3] = (int)r.w;
1299
1300 int ss[4];
1301 ss[0] = (int)s.x;
1302 ss[1] = (int)s.y;
1303 ss[2] = (int)s.z;
1304 ss[3] = (int)s.w;
1305
1306 if (polarity > 0) {
1307 \n #pragma unroll 4\n
1308 for (unsigned int i = 0; i < 4; i++) {
1309 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1310 //
1311 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1312 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1313 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1314 }
1315 }
1316 else {
1317 \n #pragma unroll 4\n
1318 for (unsigned int i = 0; i < 4; i++) {
1319 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1320 //
1321 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1322 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1323 }
1324 }
1325
1326 v.x = (CLQuantum)sv[0];
1327 v.y = (CLQuantum)sv[1];
1328 v.z = (CLQuantum)sv[2];
1329
1330 if (matte!=0)
1331 v.w = (CLQuantum)sv[3];
1332
1333 outputImage[y*imageWidth+x] = v;
1334
1335 }
1336 )
1337
1338/*
1339%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1340% %
1341% %
1342% %
1343% E q u a l i z e %
1344% %
1345% %
1346% %
1347%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1348*/
1349
1350 STRINGIFY(
1351 /*
1352 */
1353 __kernel void Equalize(__global CLPixelType * restrict im,
1354 const ChannelType channel,
1355 __global CLPixelType * restrict equalize_map,
1356 const float4 white, const float4 black)
1357 {
1358 const int x = get_global_id(0);
1359 const int y = get_global_id(1);
1360 const int columns = get_global_size(0);
1361 const int c = x + y * columns;
1362
1363 uint ePos;
1364 CLPixelType oValue, eValue;
1365 CLQuantum red, green, blue, alpha;
1366
1367 //read from global
1368 oValue=im[c];
1369
1370 if ((channel & SyncChannels) != 0)
1371 {
1372 if (getRedF4(white) != getRedF4(black))
1373 {
1374 ePos = ScaleQuantumToMap(getRed(oValue));
1375 eValue = equalize_map[ePos];
1376 red = getRed(eValue);
1377 ePos = ScaleQuantumToMap(getGreen(oValue));
1378 eValue = equalize_map[ePos];
1379 green = getRed(eValue);
1380 ePos = ScaleQuantumToMap(getBlue(oValue));
1381 eValue = equalize_map[ePos];
1382 blue = getRed(eValue);
1383 ePos = ScaleQuantumToMap(getAlpha(oValue));
1384 eValue = equalize_map[ePos];
1385 alpha = getRed(eValue);
1386
1387 //write back
1388 im[c]=(CLPixelType)(blue, green, red, alpha);
1389 }
1390
1391 }
1392
1393 // for equalizing, we always need all channels?
1394 // otherwise something more
1395
1396 }
1397 )
1398
1399/*
1400%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1401% %
1402% %
1403% %
1404% F u n c t i o n %
1405% %
1406% %
1407% %
1408%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1409*/
1410
1411 STRINGIFY(
1412 /*
1413 apply FunctionImageChannel(brightness-contrast)
1414 */
1415 CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1416 const unsigned int number_parameters,__constant float *parameters)
1417 {
1418 float result = 0.0f;
1419
1420 switch (function)
1421 {
1422 case PolynomialFunction:
1423 {
1424 for (unsigned int i=0; i < number_parameters; i++)
1425 result = result*QuantumScale*pixel + parameters[i];
1426 result *= QuantumRange;
1427 break;
1428 }
1429 case SinusoidFunction:
1430 {
1431 float freq,phase,ampl,bias;
1432 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1433 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1434 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1435 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1436 result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1437 (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1438 break;
1439 }
1440 case ArcsinFunction:
1441 {
1442 float width,range,center,bias;
1443 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1444 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1445 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1446 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1447
1448 result = 2.0f/width*(QuantumScale*pixel - center);
1449 result = range/MagickPI*asin(result)+bias;
1450 result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1451 result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1452 result *= QuantumRange;
1453 break;
1454 }
1455 case ArctanFunction:
1456 {
1457 float slope,range,center,bias;
1458 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1459 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1460 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1461 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1462 result = MagickPI*slope*(QuantumScale*pixel-center);
1463 result = QuantumRange*(range/MagickPI*atan(result) + bias);
1464 break;
1465 }
1466 case UndefinedFunction:
1467 break;
1468 }
1469 return(ClampToQuantum(result));
1470 }
1471 )
1472
1473 STRINGIFY(
1474 /*
1475 Improve brightness / contrast of the image
1476 channel : define which channel is improved
1477 function : the function called to enhance the brightness contrast
1478 number_parameters : numbers of parameters
1479 parameters : the parameter
1480 */
1481 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1482 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1483 __constant float *parameters)
1484 {
1485 const unsigned int x = get_global_id(0);
1486 const unsigned int y = get_global_id(1);
1487 const unsigned int columns = get_global_size(0);
1488 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1489
1490 float red;
1491 float green;
1492 float blue;
1493 float alpha;
1494
1495 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1496
1497 if ((channel & RedChannel) != 0)
1498 red=ApplyFunction(red, function, number_parameters, parameters);
1499
1500 if (number_channels > 2)
1501 {
1502 if ((channel & GreenChannel) != 0)
1503 green=ApplyFunction(green, function, number_parameters, parameters);
1504
1505 if ((channel & BlueChannel) != 0)
1506 blue=ApplyFunction(blue, function, number_parameters, parameters);
1507 }
1508
1509 if (((number_channels == 4) || (number_channels == 2)) &&
1510 ((channel & AlphaChannel) != 0))
1511 alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1512
1513 WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1514 }
1515 )
1516
1517/*
1518%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1519% %
1520% %
1521% %
1522% G r a y s c a l e %
1523% %
1524% %
1525% %
1526%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1527*/
1528
1529 STRINGIFY(
1530 __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1531 const unsigned int colorspace,const unsigned int method)
1532 {
1533 const unsigned int x = get_global_id(0);
1534 const unsigned int y = get_global_id(1);
1535 const unsigned int columns = get_global_size(0);
1536 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1537
1538 float
1539 blue,
1540 green,
1541 red;
1542
1543 red=getPixelRed(p);
1544 green=getPixelGreen(p);
1545 blue=getPixelBlue(p);
1546
1547 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1548
1549 setPixelRed(p,intensity);
1550 setPixelGreen(p,intensity);
1551 setPixelBlue(p,intensity);
1552 }
1553 )
1554
1555/*
1556%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1557% %
1558% %
1559% %
1560% L o c a l C o n t r a s t %
1561% %
1562% %
1563% %
1564%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1565*/
1566
1567 STRINGIFY(
1568
1569 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1570 const int radius,
1571 const int imageWidth,
1572 const int imageHeight)
1573 {
1574 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1575
1576 int x = get_local_id(0);
1577 int y = get_global_id(1);
1578
1579 if ((x >= imageWidth) || (y >= imageHeight))
1580 return;
1581
1582 global CLPixelType *src = srcImage + y * imageWidth;
1583
1584 for (int i = x; i < imageWidth; i += get_local_size(0)) {
1585 float sum = 0.0f;
1586 float weight = 1.0f;
1587
1588 int j = i - radius;
1589 while ((j + 7) < i) {
1590 for (int k = 0; k < 8; ++k) // Unroll 8x
1591 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1592 weight += 8.0f;
1593 j+=8;
1594 }
1595 while (j < i) {
1596 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1597 weight += 1.0f;
1598 ++j;
1599 }
1600
1601 while ((j + 7) < radius + i) {
1602 for (int k = 0; k < 8; ++k) // Unroll 8x
1603 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1604 weight -= 8.0f;
1605 j+=8;
1606 }
1607 while (j < radius + i) {
1608 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1609 weight -= 1.0f;
1610 ++j;
1611 }
1612
1613 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1614 }
1615 }
1616 )
1617
1618 STRINGIFY(
1619 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1620 const int radius,
1621 const float strength,
1622 const int imageWidth,
1623 const int imageHeight)
1624 {
1625 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1626
1627 int x = get_global_id(0);
1628 int y = get_global_id(1);
1629
1630 if ((x >= imageWidth) || (y >= imageHeight))
1631 return;
1632
1633 global float *src = blurImage + x;
1634
1635 float sum = 0.0f;
1636 float weight = 1.0f;
1637
1638 int j = y - radius;
1639 while ((j + 7) < y) {
1640 for (int k = 0; k < 8; ++k) // Unroll 8x
1641 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1642 weight += 8.0f;
1643 j+=8;
1644 }
1645 while (j < y) {
1646 sum += weight * src[mirrorBottom(j) * imageWidth];
1647 weight += 1.0f;
1648 ++j;
1649 }
1650
1651 while ((j + 7) < radius + y) {
1652 for (int k = 0; k < 8; ++k) // Unroll 8x
1653 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1654 weight -= 8.0f;
1655 j+=8;
1656 }
1657 while (j < radius + y) {
1658 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1659 weight -= 1.0f;
1660 ++j;
1661 }
1662
1663 CLPixelType pixel = srcImage[x + y * imageWidth];
1664 float srcVal = dot(RGB, convert_float4(pixel));
1665 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1666 mult = (srcVal + mult) / srcVal;
1667
1668 pixel.x = ClampToQuantum(pixel.x * mult);
1669 pixel.y = ClampToQuantum(pixel.y * mult);
1670 pixel.z = ClampToQuantum(pixel.z * mult);
1671
1672 dstImage[x + y * imageWidth] = pixel;
1673 }
1674 )
1675
1676/*
1677%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1678% %
1679% %
1680% %
1681% M o d u l a t e %
1682% %
1683% %
1684% %
1685%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1686*/
1687
1688 STRINGIFY(
1689
1690 static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1691 float *hue, float *saturation, float *lightness)
1692 {
1693 float
1694 c,
1695 tmax,
1696 tmin;
1697
1698 /*
1699 Convert RGB to HSL colorspace.
1700 */
1701 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1702 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1703
1704 c=tmax-tmin;
1705
1706 *lightness=(tmax+tmin)/2.0;
1707 if (c <= 0.0)
1708 {
1709 *hue=0.0;
1710 *saturation=0.0;
1711 return;
1712 }
1713
1714 if (tmax == (QuantumScale*red))
1715 {
1716 *hue=(QuantumScale*green-QuantumScale*blue)/c;
1717 if ((QuantumScale*green) < (QuantumScale*blue))
1718 *hue+=6.0;
1719 }
1720 else
1721 if (tmax == (QuantumScale*green))
1722 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1723 else
1724 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1725
1726 *hue*=60.0/360.0;
1727 if (*lightness <= 0.5)
1728 *saturation=c/(2.0*(*lightness));
1729 else
1730 *saturation=c/(2.0-2.0*(*lightness));
1731 }
1732
1733 static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
1734 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1735 {
1736 float
1737 b,
1738 c,
1739 g,
1740 h,
1741 tmin,
1742 r,
1743 x;
1744
1745 /*
1746 Convert HSL to RGB colorspace.
1747 */
1748 h=hue*360.0;
1749 if (lightness <= 0.5)
1750 c=2.0*lightness*saturation;
1751 else
1752 c=(2.0-2.0*lightness)*saturation;
1753 tmin=lightness-0.5*c;
1754 h-=360.0*floor(h/360.0);
1755 h/=60.0;
1756 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
1757 switch ((int) floor(h) % 6)
1758 {
1759 case 0:
1760 default:
1761 {
1762 r=tmin+c;
1763 g=tmin+x;
1764 b=tmin;
1765 break;
1766 }
1767 case 1:
1768 {
1769 r=tmin+x;
1770 g=tmin+c;
1771 b=tmin;
1772 break;
1773 }
1774 case 2:
1775 {
1776 r=tmin;
1777 g=tmin+c;
1778 b=tmin+x;
1779 break;
1780 }
1781 case 3:
1782 {
1783 r=tmin;
1784 g=tmin+x;
1785 b=tmin+c;
1786 break;
1787 }
1788 case 4:
1789 {
1790 r=tmin+x;
1791 g=tmin;
1792 b=tmin+c;
1793 break;
1794 }
1795 case 5:
1796 {
1797 r=tmin+c;
1798 g=tmin;
1799 b=tmin+x;
1800 break;
1801 }
1802 }
1803 *red=ClampToQuantum(QuantumRange*r);
1804 *green=ClampToQuantum(QuantumRange*g);
1805 *blue=ClampToQuantum(QuantumRange*b);
1806 }
1807
1808 static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
1809 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1810 {
1811 float
1812 hue,
1813 lightness,
1814 saturation;
1815
1816 /*
1817 Increase or decrease color lightness, saturation, or hue.
1818 */
1819 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
1820 hue+=0.5*(0.01*percent_hue-1.0);
1821 while (hue < 0.0)
1822 hue+=1.0;
1823 while (hue >= 1.0)
1824 hue-=1.0;
1825 saturation*=0.01*percent_saturation;
1826 lightness*=0.01*percent_lightness;
1827 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
1828 }
1829
1830 __kernel void Modulate(__global CLPixelType *im,
1831 const float percent_brightness,
1832 const float percent_hue,
1833 const float percent_saturation,
1834 const int colorspace)
1835 {
1836
1837 const int x = get_global_id(0);
1838 const int y = get_global_id(1);
1839 const int columns = get_global_size(0);
1840 const int c = x + y * columns;
1841
1842 CLPixelType pixel = im[c];
1843
1844 CLQuantum
1845 blue,
1846 green,
1847 red;
1848
1849 red=getRed(pixel);
1850 green=getGreen(pixel);
1851 blue=getBlue(pixel);
1852
1853 switch (colorspace)
1854 {
1855 case HSLColorspace:
1856 default:
1857 {
1858 ModulateHSL(percent_hue, percent_saturation, percent_brightness,
1859 &red, &green, &blue);
1860 }
1861
1862 }
1863
1864 CLPixelType filteredPixel;
1865
1866 setRed(&filteredPixel, red);
1867 setGreen(&filteredPixel, green);
1868 setBlue(&filteredPixel, blue);
1869 filteredPixel.w = pixel.w;
1870
1871 im[c] = filteredPixel;
1872 }
1873 )
1874
1875/*
1876%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1877% %
1878% %
1879% %
1880% M o t i o n B l u r %
1881% %
1882% %
1883% %
1884%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1885*/
1886
1887 STRINGIFY(
1888 __kernel
1889 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
1890 const unsigned int imageWidth, const unsigned int imageHeight,
1891 const __global float *filter, const unsigned int width, const __global int2* offset,
1892 const float4 bias,
1893 const ChannelType channel, const unsigned int matte) {
1894
1895 int2 currentPixel;
1896 currentPixel.x = get_global_id(0);
1897 currentPixel.y = get_global_id(1);
1898
1899 if (currentPixel.x >= imageWidth
1900 || currentPixel.y >= imageHeight)
1901 return;
1902
1903 float4 pixel;
1904 pixel.x = (float)bias.x;
1905 pixel.y = (float)bias.y;
1906 pixel.z = (float)bias.z;
1907 pixel.w = (float)bias.w;
1908
1909 if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1910
1911 for (int i = 0; i < width; i++) {
1912 // only support EdgeVirtualPixelMethod through ClampToCanvas
1913 // TODO: implement other virtual pixel method
1914 int2 samplePixel = currentPixel + offset[i];
1915 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1916 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1917 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1918
1919 pixel.x += (filter[i] * (float)samplePixelValue.x);
1920 pixel.y += (filter[i] * (float)samplePixelValue.y);
1921 pixel.z += (filter[i] * (float)samplePixelValue.z);
1922 pixel.w += (filter[i] * (float)samplePixelValue.w);
1923 }
1924
1925 CLPixelType outputPixel;
1926 outputPixel.x = ClampToQuantum(pixel.x);
1927 outputPixel.y = ClampToQuantum(pixel.y);
1928 outputPixel.z = ClampToQuantum(pixel.z);
1929 outputPixel.w = ClampToQuantum(pixel.w);
1930 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1931 }
1932 else {
1933
1934 float gamma = 0.0f;
1935 for (int i = 0; i < width; i++) {
1936 // only support EdgeVirtualPixelMethod through ClampToCanvas
1937 // TODO: implement other virtual pixel method
1938 int2 samplePixel = currentPixel + offset[i];
1939 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1940 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1941
1942 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1943
1944 float alpha = QuantumScale*samplePixelValue.w;
1945 float k = filter[i];
1946 pixel.x = pixel.x + k * alpha * samplePixelValue.x;
1947 pixel.y = pixel.y + k * alpha * samplePixelValue.y;
1948 pixel.z = pixel.z + k * alpha * samplePixelValue.z;
1949
1950 pixel.w += k * alpha * samplePixelValue.w;
1951
1952 gamma+=k*alpha;
1953 }
1954 gamma = PerceptibleReciprocal(gamma);
1955 pixel.xyz = gamma*pixel.xyz;
1956
1957 CLPixelType outputPixel;
1958 outputPixel.x = ClampToQuantum(pixel.x);
1959 outputPixel.y = ClampToQuantum(pixel.y);
1960 outputPixel.z = ClampToQuantum(pixel.z);
1961 outputPixel.w = ClampToQuantum(pixel.w);
1962 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1963 }
1964 }
1965 )
1966
1967/*
1968%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1969% %
1970% %
1971% %
1972% R e s i z e %
1973% %
1974% %
1975% %
1976%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1977*/
1978
1979 STRINGIFY(
1980 // Based on Box from resize.c
1981 float BoxResizeFilter(const float x)
1982 {
1983 return 1.0f;
1984 }
1985 )
1986
1987 STRINGIFY(
1988 // Based on CubicBC from resize.c
1989 float CubicBC(const float x,const __global float* resizeFilterCoefficients)
1990 {
1991 /*
1992 Cubic Filters using B,C determined values:
1993 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
1994 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
1995 Spline B = 1 C = 0 B-Spline Gaussian approximation
1996 Hermite B = 0 C = 0 B-Spline interpolator
1997
1998 See paper by Mitchell and Netravali, Reconstruction Filters in Computer
1999 Graphics Computer Graphics, Volume 22, Number 4, August 1988
2000 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2001 Mitchell.pdf.
2002
2003 Coefficients are determined from B,C values:
2004 P0 = ( 6 - 2*B )/6 = coeff[0]
2005 P1 = 0
2006 P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2007 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2008 Q0 = ( 8*B +24*C )/6 = coeff[3]
2009 Q1 = ( -12*B -48*C )/6 = coeff[4]
2010 Q2 = ( 6*B +30*C )/6 = coeff[5]
2011 Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2012
2013 which are used to define the filter:
2014
2015 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2016 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2017
2018 which ensures function is continuous in value and derivative (slope).
2019 */
2020 if (x < 1.0)
2021 return(resizeFilterCoefficients[0]+x*(x*
2022 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2023 if (x < 2.0)
2024 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2025 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2026 return(0.0);
2027 }
2028 )
2029
2030 STRINGIFY(
2031 float Sinc(const float x)
2032 {
2033 if (x != 0.0f)
2034 {
2035 const float alpha=(float) (MagickPI*x);
2036 return sinpi(x)/alpha;
2037 }
2038 return(1.0f);
2039 }
2040 )
2041
2042 STRINGIFY(
2043 float Triangle(const float x)
2044 {
2045 /*
2046 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2047 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2048 for Sinc().
2049 */
2050 return ((x<1.0f)?(1.0f-x):0.0f);
2051 }
2052 )
2053
2054
2055 STRINGIFY(
2056 float Hann(const float x)
2057 {
2058 /*
2059 Cosine window function:
2060 0.5+0.5*cos(pi*x).
2061 */
2062 const float cosine=cos((MagickPI*x));
2063 return(0.5f+0.5f*cosine);
2064 }
2065 )
2066
2067 STRINGIFY(
2068 float Hamming(const float x)
2069 {
2070 /*
2071 Offset cosine window function:
2072 .54 + .46 cos(pi x).
2073 */
2074 const float cosine=cos((MagickPI*x));
2075 return(0.54f+0.46f*cosine);
2076 }
2077 )
2078
2079 STRINGIFY(
2080 float Blackman(const float x)
2081 {
2082 /*
2083 Blackman: 2nd order cosine windowing function:
2084 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2085
2086 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2087 five flops.
2088 */
2089 const float cosine=cos((MagickPI*x));
2090 return(0.34f+cosine*(0.5f+cosine*0.16f));
2091 }
2092 )
2093
2094 STRINGIFY(
2095 static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2096 {
2097 switch (filterType)
2098 {
2099 /* Call Sinc even for SincFast to get better precision on GPU
2100 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2101 case SincWeightingFunction:
2102 case SincFastWeightingFunction:
2103 return Sinc(x);
2104 case CubicBCWeightingFunction:
2105 return CubicBC(x,filterCoefficients);
2106 case BoxWeightingFunction:
2107 return BoxResizeFilter(x);
2108 case TriangleWeightingFunction:
2109 return Triangle(x);
2110 case HannWeightingFunction:
2111 return Hann(x);
2112 case HammingWeightingFunction:
2113 return Hamming(x);
2114 case BlackmanWeightingFunction:
2115 return Blackman(x);
2116
2117 default:
2118 return 0.0f;
2119 }
2120 }
2121 )
2122
2123
2124 STRINGIFY(
2125 static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2126 , const ResizeWeightingFunctionType resizeWindowType
2127 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2128 {
2129 float scale;
2130 float xBlur = fabs(x/resizeFilterBlur);
2131 if (resizeWindowSupport < MagickEpsilon
2132 || resizeWindowType == BoxWeightingFunction)
2133 {
2134 scale = 1.0f;
2135 }
2136 else
2137 {
2138 scale = resizeFilterScale;
2139 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2140 }
2141 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2142 return weight;
2143 }
2144
2145 )
2146
2147 ;
2148 const char *accelerateKernels2 =
2149
2150 STRINGIFY(
2151
2152 static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2153 return (numWorkItems/pixelPerWorkgroup);
2154 }
2155
2156 // returns the index of the pixel for the current workitem to compute.
2157 // returns -1 if this workitem doesn't need to participate in any computation
2158 static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2159 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2160 int pixelIndex = itemID/numWorkItemsPerPixel;
2161 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2162 return pixelIndex;
2163 }
2164
2165 )
2166
2167 STRINGIFY(
2168 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2169 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2170 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2171 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2172 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2173 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2174 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2175 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2176 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2177 {
2178 // calculate the range of resized image pixels computed by this workgroup
2179 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2180 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2181 const unsigned int actualNumPixelToCompute = stopX - startX;
2182
2183 // calculate the range of input image pixels to cache
2184 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2185 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2186 scale = PerceptibleReciprocal(scale);
2187
2188 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2189 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2190
2191 // cache the input pixels into local memory
2192 const unsigned int y = get_global_id(1);
2193 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2194 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2195 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2196 wait_group_events(1, &e);
2197
2198 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2199 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2200 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2201 {
2202 const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2203 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2204 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2205
2206 // determine which resized pixel computed by this workitem
2207 const unsigned int itemID = get_local_id(0);
2208 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2209
2210 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2211
2212 float4 filteredPixel = (float4)0.0f;
2213 float density = 0.0f;
2214 float gamma = 0.0f;
2215 // -1 means this workitem doesn't participate in the computation
2216 if (pixelIndex != -1)
2217 {
2218 // x coordinated of the resized pixel computed by this workitem
2219 const int x = chunkStartX + pixelIndex;
2220
2221 // calculate how many steps required for this pixel
2222 const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2223 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2224 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2225 const unsigned int n = stop - start;
2226
2227 // calculate how many steps this workitem will contribute
2228 unsigned int numStepsPerWorkItem = n / numItems;
2229 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2230
2231 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2232 if (startStep < n)
2233 {
2234 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2235
2236 unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2237 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2238 {
2239 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2240 (ResizeWeightingFunctionType) resizeFilterType,
2241 (ResizeWeightingFunctionType) resizeWindowType,
2242 resizeFilterScale, resizeFilterWindowSupport,
2243 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2244
2245 float4 cp = (float4)0.0f;
2246
2247 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2248 cp.x = (float) *(p);
2249 if (number_channels > 2)
2250 {
2251 cp.y = (float) *(p + 1);
2252 cp.z = (float) *(p + 2);
2253 }
2254
2255 if (alpha_index != 0)
2256 {
2257 cp.w = (float) *(p + alpha_index);
2258
2259 float alpha = weight * QuantumScale * cp.w;
2260
2261 filteredPixel.x += alpha * cp.x;
2262 filteredPixel.y += alpha * cp.y;
2263 filteredPixel.z += alpha * cp.z;
2264 filteredPixel.w += weight * cp.w;
2265 gamma += alpha;
2266 }
2267 else
2268 filteredPixel += ((float4) weight)*cp;
2269
2270 density += weight;
2271 }
2272 }
2273 }
2274
2275 // initialize the accumulators to zero
2276 if (itemID < actualNumPixelInThisChunk) {
2277 outputPixelCache[itemID] = (float4)0.0f;
2278 densityCache[itemID] = 0.0f;
2279 if (alpha_index != 0)
2280 gammaCache[itemID] = 0.0f;
2281 }
2282 barrier(CLK_LOCAL_MEM_FENCE);
2283
2284 // accumulate the filtered pixel value and the density
2285 for (unsigned int i = 0; i < numItems; i++) {
2286 if (pixelIndex != -1) {
2287 if (itemID%numItems == i) {
2288 outputPixelCache[pixelIndex]+=filteredPixel;
2289 densityCache[pixelIndex]+=density;
2290 if (alpha_index != 0)
2291 gammaCache[pixelIndex]+=gamma;
2292 }
2293 }
2294 barrier(CLK_LOCAL_MEM_FENCE);
2295 }
2296
2297 if (itemID < actualNumPixelInThisChunk)
2298 {
2299 float4 filteredPixel = outputPixelCache[itemID];
2300
2301 float gamma = 0.0f;
2302 if (alpha_index != 0)
2303 gamma = gammaCache[itemID];
2304
2305 float density = densityCache[itemID];
2306 if ((density != 0.0f) && (density != 1.0f))
2307 {
2308 density = PerceptibleReciprocal(density);
2309 filteredPixel *= (float4) density;
2310 if (alpha_index != 0)
2311 gamma *= density;
2312 }
2313
2314 if (alpha_index != 0)
2315 {
2316 gamma = PerceptibleReciprocal(gamma);
2317 filteredPixel.x *= gamma;
2318 filteredPixel.y *= gamma;
2319 filteredPixel.z *= gamma;
2320 }
2321
2322 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2323 }
2324 }
2325 }
2326 )
2327
2328
2329 STRINGIFY(
2330 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2331 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2332 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2333 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2334 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2335 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2336 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2337 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2338 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2339 {
2340 // calculate the range of resized image pixels computed by this workgroup
2341 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2342 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2343 const unsigned int actualNumPixelToCompute = stopY - startY;
2344
2345 // calculate the range of input image pixels to cache
2346 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2347 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2348 scale = PerceptibleReciprocal(scale);
2349
2350 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2351 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2352
2353 // cache the input pixels into local memory
2354 const unsigned int x = get_global_id(0);
2355 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2356 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2357 unsigned int stride = inputColumns * number_channels;
2358 for (unsigned int i = 0; i < number_channels; i++)
2359 {
2360 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2361 wait_group_events(1,&e);
2362 }
2363
2364 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2365 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2366 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2367 {
2368 const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2369 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2370 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2371
2372 // determine which resized pixel computed by this workitem
2373 const unsigned int itemID = get_local_id(1);
2374 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2375
2376 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2377
2378 float4 filteredPixel = (float4)0.0f;
2379 float density = 0.0f;
2380 float gamma = 0.0f;
2381 // -1 means this workitem doesn't participate in the computation
2382 if (pixelIndex != -1)
2383 {
2384 // x coordinated of the resized pixel computed by this workitem
2385 const int y = chunkStartY + pixelIndex;
2386
2387 // calculate how many steps required for this pixel
2388 const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2389 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2390 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2391 const unsigned int n = stop - start;
2392
2393 // calculate how many steps this workitem will contribute
2394 unsigned int numStepsPerWorkItem = n / numItems;
2395 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2396
2397 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2398 if (startStep < n)
2399 {
2400 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2401
2402 unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2403 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2404 {
2405 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2406 (ResizeWeightingFunctionType) resizeFilterType,
2407 (ResizeWeightingFunctionType) resizeWindowType,
2408 resizeFilterScale, resizeFilterWindowSupport,
2409 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2410
2411 float4 cp = (float4)0.0f;
2412
2413 __local CLQuantum *p = inputImageCache + cacheIndex;
2414 cp.x = (float) *(p);
2415 if (number_channels > 2)
2416 {
2417 cp.y = (float) *(p + rangeLength);
2418 cp.z = (float) *(p + (rangeLength * 2));
2419 }
2420
2421 if (alpha_index != 0)
2422 {
2423 cp.w = (float) *(p + (rangeLength * alpha_index));
2424
2425 float alpha = weight * QuantumScale * cp.w;
2426
2427 filteredPixel.x += alpha * cp.x;
2428 filteredPixel.y += alpha * cp.y;
2429 filteredPixel.z += alpha * cp.z;
2430 filteredPixel.w += weight * cp.w;
2431 gamma += alpha;
2432 }
2433 else
2434 filteredPixel += ((float4) weight)*cp;
2435
2436 density += weight;
2437 }
2438 }
2439 }
2440
2441 // initialize the accumulators to zero
2442 if (itemID < actualNumPixelInThisChunk) {
2443 outputPixelCache[itemID] = (float4)0.0f;
2444 densityCache[itemID] = 0.0f;
2445 if (alpha_index != 0)
2446 gammaCache[itemID] = 0.0f;
2447 }
2448 barrier(CLK_LOCAL_MEM_FENCE);
2449
2450 // accumulate the filtered pixel value and the density
2451 for (unsigned int i = 0; i < numItems; i++) {
2452 if (pixelIndex != -1) {
2453 if (itemID%numItems == i) {
2454 outputPixelCache[pixelIndex]+=filteredPixel;
2455 densityCache[pixelIndex]+=density;
2456 if (alpha_index != 0)
2457 gammaCache[pixelIndex]+=gamma;
2458 }
2459 }
2460 barrier(CLK_LOCAL_MEM_FENCE);
2461 }
2462
2463 if (itemID < actualNumPixelInThisChunk)
2464 {
2465 float4 filteredPixel = outputPixelCache[itemID];
2466
2467 float gamma = 0.0f;
2468 if (alpha_index != 0)
2469 gamma = gammaCache[itemID];
2470
2471 float density = densityCache[itemID];
2472 if ((density != 0.0f) && (density != 1.0f))
2473 {
2474 density = PerceptibleReciprocal(density);
2475 filteredPixel *= (float4) density;
2476 if (alpha_index != 0)
2477 gamma *= density;
2478 }
2479
2480 if (alpha_index != 0)
2481 {
2482 gamma = PerceptibleReciprocal(gamma);
2483 filteredPixel.x *= gamma;
2484 filteredPixel.y *= gamma;
2485 filteredPixel.z *= gamma;
2486 }
2487
2488 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2489 }
2490 }
2491 }
2492 )
2493
2494/*
2495%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2496% %
2497% %
2498% %
2499% R o t a t i o n a l B l u r %
2500% %
2501% %
2502% %
2503%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2504*/
2505
2506 STRINGIFY(
2507 __kernel void RotationalBlur(const __global CLQuantum *image,
2508 const unsigned int number_channels,const unsigned int channel,
2509 const float2 blurCenter,__constant float *cos_theta,
2510 __constant float *sin_theta,const unsigned int cossin_theta_size,
2511 __global CLQuantum *filteredImage)
2512 {
2513 const int x = get_global_id(0);
2514 const int y = get_global_id(1);
2515 const int columns = get_global_size(0);
2516 const int rows = get_global_size(1);
2517 unsigned int step = 1;
2518 float center_x = (float) x - blurCenter.x;
2519 float center_y = (float) y - blurCenter.y;
2520 float radius = hypot(center_x, center_y);
2521
2522 float blur_radius = hypot(blurCenter.x, blurCenter.y);
2523
2524 if (radius > MagickEpsilon)
2525 {
2526 step = (unsigned int) (blur_radius / radius);
2527 if (step == 0)
2528 step = 1;
2529 if (step >= cossin_theta_size)
2530 step = cossin_theta_size-1;
2531 }
2532
2533 float4 result = 0.0f;
2534 float normalize = 0.0f;
2535 float gamma = 0.0f;
2536
2537 for (unsigned int i=0; i<cossin_theta_size; i+=step)
2538 {
2539 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2540 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2541
2542 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2543
2544 if ((number_channels == 4) || (number_channels == 2))
2545 {
2546 float alpha = (float)(QuantumScale*pixel.w);
2547
2548 gamma += alpha;
2549
2550 result.x += alpha * pixel.x;
2551 result.y += alpha * pixel.y;
2552 result.z += alpha * pixel.z;
2553 result.w += pixel.w;
2554 }
2555 else
2556 result += pixel;
2557
2558 normalize += 1.0f;
2559 }
2560
2561 normalize = PerceptibleReciprocal(normalize);
2562
2563 if ((number_channels == 4) || (number_channels == 2))
2564 {
2565 gamma = PerceptibleReciprocal(gamma);
2566 result.x *= gamma;
2567 result.y *= gamma;
2568 result.z *= gamma;
2569 result.w *= normalize;
2570 }
2571 else
2572 result *= normalize;
2573
2574 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2575 }
2576 )
2577
2578/*
2579%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2580% %
2581% %
2582% %
2583% U n s h a r p M a s k %
2584% %
2585% %
2586% %
2587%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2588*/
2589
2590 STRINGIFY(
2591 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2592 const __global float4 *blurRowData,const unsigned int number_channels,
2593 const ChannelType channel,const unsigned int columns,
2594 const unsigned int rows,__local float4* cachedData,
2595 __local float* cachedFilter,const __global float *filter,
2596 const unsigned int width,const float gain, const float threshold,
2597 __global CLQuantum *filteredImage)
2598 {
2599 const unsigned int radius = (width-1)/2;
2600
2601 // cache the pixel shared by the workgroup
2602 const int groupX = get_group_id(0);
2603 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2604 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2605
2606 if ((groupStartY >= 0) && (groupStopY < rows))
2607 {
2608 event_t e = async_work_group_strided_copy(cachedData,
2609 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2610 wait_group_events(1,&e);
2611 }
2612 else
2613 {
2614 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2615 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2616
2617 barrier(CLK_LOCAL_MEM_FENCE);
2618 }
2619 // cache the filter as well
2620 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2621 wait_group_events(1,&e);
2622
2623 // only do the work if this is not a patched item
2624 const int cy = get_global_id(1);
2625
2626 if (cy < rows)
2627 {
2628 float4 blurredPixel = (float4) 0.0f;
2629
2630 int i = 0;
2631
2632 for ( ; i+7 < width; )
2633 {
2634 for (int j=0; j < 8; j++, i++)
2635 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2636 }
2637
2638 for ( ; i < width; i++)
2639 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2640
2641 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2642 float4 outputPixel = inputImagePixel - blurredPixel;
2643
2644 float quantumThreshold = QuantumRange*threshold;
2645
2646 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2647 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2648
2649 //write back
2650 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2651 }
2652 }
2653 )
2654
2655 STRINGIFY(
2656 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2657 const ChannelType channel,__constant float *filter,const unsigned int width,
2658 const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2659 const float gain,const float threshold,__global CLQuantum *filteredImage)
2660 {
2661 const unsigned int x = get_global_id(0);
2662 const unsigned int y = get_global_id(1);
2663
2664 const unsigned int radius = (width - 1) / 2;
2665
2666 int row = y - radius;
2667 int baseRow = get_group_id(1) * get_local_size(1) - radius;
2668 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2669
2670 while (row < endRow) {
2671 int srcy = (row < 0) ? -row : row; // mirror pad
2672 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2673
2674 float4 value = 0.0f;
2675
2676 int ix = x - radius;
2677 int i = 0;
2678
2679 while (i + 7 < width) {
2680 for (int j = 0; j < 8; ++j) { // unrolled
2681 int srcx = ix + j;
2682 srcx = (srcx < 0) ? -srcx : srcx;
2683 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2684 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2685 }
2686 ix += 8;
2687 i += 8;
2688 }
2689
2690 while (i < width) {
2691 int srcx = (ix < 0) ? -ix : ix; // mirror pad
2692 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2693 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2694 ++i;
2695 ++ix;
2696 }
2697 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2698 row += get_local_size(1);
2699 }
2700
2701 barrier(CLK_LOCAL_MEM_FENCE);
2702
2703 const int px = get_local_id(0);
2704 const int py = get_local_id(1);
2705 const int prp = get_local_size(0);
2706 float4 value = (float4)(0.0f);
2707
2708 int i = 0;
2709 while (i + 7 < width) {
2710 for (int j = 0; j < 8; ++j) // unrolled
2711 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2712 i += 8;
2713 }
2714 while (i < width) {
2715 value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2716 ++i;
2717 }
2718
2719 if ((x < columns) && (y < rows)) {
2720 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2721 float4 diff = srcPixel - value;
2722
2723 float quantumThreshold = QuantumRange*threshold;
2724
2725 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
2726 value = select(srcPixel + diff * gain, srcPixel, mask);
2727
2728 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
2729 }
2730 }
2731 )
2732
2733/*
2734%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2735% %
2736% %
2737% %
2738% W a v e l e t D e n o i s e %
2739% %
2740% %
2741% %
2742%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2743*/
2744
2745 STRINGIFY(
2746 __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
2747 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
2748 const unsigned int number_channels,const unsigned int max_channels,
2749 const float threshold,const int passes,const unsigned int imageWidth,
2750 const unsigned int imageHeight)
2751 {
2752 const int pad = (1 << (passes - 1));
2753 const int tileSize = 64;
2754 const int tileRowPixels = 64;
2755 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
2756
2757 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
2758
2759 local float buffer[64 * 64];
2760
2761 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
2762 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
2763
2764 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2765 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
2766 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
2767
2768 for (int channel = 0; channel < max_channels; ++channel)
2769 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
2770 }
2771
2772 for (int channel = 0; channel < max_channels; ++channel) {
2773 // Load LDS
2774 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2775 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
2776
2777 // Process
2778
2779 float tmp[16];
2780 float accum[16];
2781 float pixel;
2782
2783 for (int i = 0; i < 16; i++)
2784 accum[i]=0.0f;
2785
2786 for (int pass = 0; pass < passes; ++pass) {
2787 const int radius = 1 << pass;
2788 const int x = get_local_id(0);
2789 const float thresh = threshold * noise[pass];
2790
2791 // Apply horizontal hat
2792 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2793 const int offset = i * tileRowPixels;
2794 if (pass == 0)
2795 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
2796 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
2797 barrier(CLK_LOCAL_MEM_FENCE);
2798 buffer[x + offset] = pixel;
2799 }
2800 barrier(CLK_LOCAL_MEM_FENCE);
2801
2802 // Apply vertical hat
2803 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2804 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
2805 float delta = tmp[i / 4] - pixel;
2806 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
2807 if (delta < -thresh)
2808 delta += thresh;
2809 else if (delta > thresh)
2810 delta -= thresh;
2811 else
2812 delta = 0;
2813 accum[i / 4] += delta;
2814 }
2815 barrier(CLK_LOCAL_MEM_FENCE);
2816
2817 if (pass < passes - 1)
2818 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2819 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
2820 else // last pass
2821 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2822 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
2823 barrier(CLK_LOCAL_MEM_FENCE);
2824 }
2825
2826 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2827 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
2828
2829 barrier(CLK_LOCAL_MEM_FENCE);
2830 }
2831
2832 // Write from stage to output
2833
2834 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
2835 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2836 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
2837 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
2838 for (int channel = 0; channel < max_channels; ++channel) {
2839 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
2840 }
2841 }
2842 }
2843 }
2844 }
2845 )
2846
2847 ;
2848
2849#endif // MAGICKCORE_OPENCL_SUPPORT
2850
2851#if defined(__cplusplus) || defined(c_plusplus)
2852}
2853#endif
2854
2855#endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H