Gitly

<
1 /* stb_image_resize2 - v2.11 - public domain image resizing
2 
3    by Jeff Roberts (v2) and Jorge L Rodriguez
4    http://github.com/nothings/stb
5 
6    Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
7    scaling and translation is supported, no rotations or shears.
8 
9    COMPILING & LINKING
10       In one C/C++ file that #includes this file, do this:
11          #define STB_IMAGE_RESIZE_IMPLEMENTATION
12       before the #include. That will create the implementation in that file.
13 
14    EASY API CALLS:
15      Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
16 
17      stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
18                               output_pixels, output_w, output_h, output_stride_in_bytes,
19                               pixel_layout_enum )
20 
21      stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
22                                 output_pixels, output_w, output_h, output_stride_in_bytes,
23                                 pixel_layout_enum )
24 
25      stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
26                                 output_pixels, output_w, output_h, output_stride_in_bytes,
27                                 pixel_layout_enum )
28 
29      If you pass NULL or zero for the output_pixels, we will allocate the output buffer
30      for you and return it from the function (free with free() or STBIR_FREE).
31      As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
32 
33    API LEVELS
34       There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
35 
36       See the "header file" section of the source for API documentation.
37 
38    ADDITIONAL DOCUMENTATION
39 
40       MEMORY ALLOCATION
41          By default, we use malloc and free for memory allocation.  To override the
42          memory allocation, before the implementation #include, add a:
43 
44             #define STBIR_MALLOC(size,user_data) ...
45             #define STBIR_FREE(ptr,user_data)   ...
46 
47          Each resize makes exactly one call to malloc/free (unless you use the
48          extended API where you can do one allocation for many resizes). Under
49          address sanitizer, we do separate allocations to find overread/writes.
50 
51       PERFORMANCE
52          This library was written with an emphasis on performance. When testing
53          stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
54          STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
55          libs do by default). Also, make sure SIMD is turned on of course (default
56          for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
57 
58          This library also comes with profiling built-in. If you define STBIR_PROFILE,
59          you can use the advanced API and get low-level profiling information by
60          calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
61          after a resize.
62 
63       SIMD
64          Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
65 
66          On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
67          ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
68          STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
69          or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
70          support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
71 
72          On Linux, SSE2 and Neon are on by default for 64-bit x64 or ARM64. For 32-bit,
73          we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
74          on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
75          clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
76          automatically enable NEON.
77 
78          On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
79          for converting back and forth to half-floats. This is autoselected when we
80          are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
81          the built-in half float hardware NEON instructions.
82 
83          You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
84          Because x86 doesn't always have fma, we turn it off by default to maintain
85          determinism across all platforms. If you don't care about non-FMA determinism
86          and are willing to restrict yourself to more recent x86 CPUs (around the AVX
87          timeframe), then fma will give you around a 15% speedup.
88 
89          You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
90          off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
91          to 40% faster, and AVX2 is generally another 12%.
92 
93       ALPHA CHANNEL
94          Most of the resizing functions provide the ability to control how the alpha
95          channel of an image is processed.
96 
97          When alpha represents transparency, it is important that when combining
98          colors with filtering, the pixels should not be treated equally; they
99          should use a weighted average based on their alpha values. For example,
100          if a pixel is 1% opaque bright green and another pixel is 99% opaque
101          black and you average them, the average will be 50% opaque, but the
102          unweighted average will be a middling green color, while the weighted
103          average will be nearly black. This means the unweighted version introduced
104          green energy that didn't exist in the source image.
105 
106          (If you want to know why this makes sense, you can work out the math for
107          the following: consider what happens if you alpha composite a source image
108          over a fixed color and then average the output, vs. if you average the
109          source image pixels and then composite that over the same fixed color.
110          Only the weighted average produces the same result as the ground truth
111          composite-then-average result.)
112 
113          Therefore, it is in general best to "alpha weight" the pixels when applying
114          filters to them. This essentially means multiplying the colors by the alpha
115          values before combining them, and then dividing by the alpha value at the
116          end.
117 
118          The computer graphics industry introduced a technique called "premultiplied
119          alpha" or "associated alpha" in which image colors are stored in image files
120          already multiplied by their alpha. This saves some math when compositing,
121          and also avoids the need to divide by the alpha at the end (which is quite
122          inefficient). However, while premultiplied alpha is common in the movie CGI
123          industry, it is not commonplace in other industries like videogames, and most
124          consumer file formats are generally expected to contain not-premultiplied
125          colors. For example, Photoshop saves PNG files "unpremultiplied", and web
126          browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
127 
128          Note that there are three possibilities that might describe your image
129          and resize expectation:
130 
131              1. images are not premultiplied, alpha weighting is desired
132              2. images are not premultiplied, alpha weighting is not desired
133              3. images are premultiplied
134 
135          Both case #2 and case #3 require the exact same math: no alpha weighting
136          should be applied or removed. Only case 1 requires extra math operations;
137          the other two cases can be handled identically.
138 
139          stb_image_resize expects case #1 by default, applying alpha weighting to
140          images, expecting the input images to be unpremultiplied. This is what the
141          COLOR+ALPHA buffer types tell the resizer to do.
142 
143          When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
144          STBIR_ABGR, STBIR_RA, or STBIR_AR you are telling us that the pixels are
145          non-premultiplied. In these cases, the resizer will alpha weight the colors
146          (effectively creating the premultiplied image), do the filtering, and then
147          convert back to non-premult on exit.
148 
149          When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM, STBIR_ARGB_PM,
150          STBIR_ABGR_PM, STBIR_RA_PM or STBIR_AR_PM, you are telling us that the pixels
151          ARE premultiplied. In this case, the resizer doesn't have to do the
152          premultiplying - it can filter directly on the input. This is about twice as
153          fast as the non-premultiplied case, so it's the right option if your data is
154          already setup correctly.
155 
156          When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
157          telling us that there is no channel that represents transparency; it may be
158          RGB and some unrelated fourth channel that has been stored in the alpha
159          channel, but it is actually not alpha. No special processing will be
160          performed.
161 
162          The difference between the generic 4 or 2 channel layouts, and the
163          specialized _PM versions is with the _PM versions you are telling us that
164          the data *is* alpha, just don't premultiply it. That's important when
165          using SRGB pixel formats, we need to know where the alpha is, because
166          it is converted linearly (rather than with the SRGB converters).
167 
168          Because alpha weighting produces the same effect as premultiplying, you
169          even have the option with non-premultiplied inputs to let the resizer
170          produce a premultiplied output. Because the initially computed alpha-weighted
171          output image is effectively premultiplied, this is actually more performant
172          than the normal path which un-premultiplies the output image as a final step.
173 
174          Finally, when converting both in and out of non-premultiplied space (for
175          example, when using STBIR_RGBA), we go to somewhat heroic measures to
176          ensure that areas with zero alpha value pixels get something reasonable
177          in the RGB values. If you don't care about the RGB values of zero alpha
178          pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
179          function - this runs a premultiplied resize about 25% faster. That said,
180          when you really care about speed, using premultiplied pixels for both in
181          and out (STBIR_RGBA_PM, etc) is much faster than both of these premultiplied
182          options.
183 
184       PIXEL LAYOUT CONVERSION
185          The resizer can convert from some pixel layouts to others. When using the
186          stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
187          on input, and STBIR_ARGB on output, and it will re-organize the channels
188          during the resize. Currently, you can only convert between two pixel
189          layouts with the same number of channels.
190 
191       DETERMINISM
192          We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
193          This requires compiling with fast-math off (using at least /fp:precise).
194          Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
195          We attempt to do this with pragmas, but with Clang, you usually want to add
196          -ffp-contract=off to the command line as well.
197 
198          For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
199          if the scalar x87 unit gets used at all, we immediately lose determinism.
200          On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
201          no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
202          with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
203          -mfpmath=sse.
204 
205          Note that we will not be deterministic with float data containing NaNs -
206          the NaNs will propagate differently on different SIMD and platforms.
207 
208          If you turn on STBIR_USE_FMA, then we will be deterministic with other
209          fma targets, but we will differ from non-fma targets (this is unavoidable,
210          because a fma isn't simply an add with a mult - it also introduces a
211          rounding difference compared to non-fma instruction sequences).
212 
213       FLOAT PIXEL FORMAT RANGE
214          Any range of values can be used for the non-alpha float data that you pass
215          in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
216          but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
217          scale back properly. The alpha channel must also be 0 to 1 for any format
218          that does premultiplication prior to resizing.
219 
220          Note also that with float output, using filters with negative lobes, the
221          output filtered values might go slightly out of range. You can define
222          STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
223          to clamp to on output, if that's important.
224 
225       MAX/MIN SCALE FACTORS
226          The input pixel resolutions are in integers, and we do the internal pointer
227          resolution in size_t sized integers. However, the scale ratio from input
228          resolution to output resolution is calculated in float form. This means
229          the effective possible scale ratio is limited to 24 bits (or 16 million
230          to 1). As you get close to the size of the float resolution (again, 16
231          million pixels wide or high), you might start seeing float inaccuracy
232          issues in general in the pipeline. If you have to do extreme resizes,
233          you can usually do this in multiple stages (using float intermediate
234          buffers).
235 
236       FLIPPED IMAGES
237          Stride is just the delta from one scanline to the next. This means you can
238          use a negative stride to handle inverted images (point to the final
239          scanline and use a negative stride). You can invert the input or output,
240          using negative strides.
241 
242       DEFAULT FILTERS
243          For functions which don't provide explicit control over what filters to
244          use, you can change the compile-time defaults with:
245 
246             #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
247             #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
248 
249          See stbir_filter in the header-file section for the list of filters.
250 
251       NEW FILTERS
252          A number of 1D filter kernels are supplied. For a list of supported
253          filters, see the stbir_filter enum. You can install your own filters by
254          using the stbir_set_filter_callbacks function.
255 
256       PROGRESS
257          For interactive use with slow resize operations, you can use the
258          scanline callbacks in the extended API. It would have to be a *very* large
259          image resample to need progress though - we're very fast.
260 
261       CEIL and FLOOR
262          In scalar mode, the only functions we use from math.h are ceilf and floorf,
263          but if you have your own versions, you can define the STBIR_CEILF(v) and
264          STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
265          our own versions.
266 
267       ASSERT
268          Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
269 
270      PORTING FROM VERSION 1
271         The API has changed. You can continue to use the old version of stb_image_resize.h,
272         which is available in the "deprecated/" directory.
273 
274         If you're using the old simple-to-use API, porting is straightforward.
275         (For more advanced APIs, read the documentation.)
276 
277           stbir_resize_uint8():
278             - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
279 
280           stbir_resize_float():
281             - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
282 
283           stbir_resize_uint8_srgb():
284             - function name is unchanged
285             - cast channel count to `stbir_pixel_layout`
286             - above is sufficient unless your image has alpha and it's not RGBA/BGRA
287               - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
288 
289           stbir_resize_uint8_srgb_edgemode()
290             - switch to the "medium complexity" API
291             - stbir_resize(), very similar API but a few more parameters:
292               - pixel_layout: cast channel count to `stbir_pixel_layout`
293               - data_type:    STBIR_TYPE_UINT8_SRGB
294               - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
295               - filter:       STBIR_FILTER_DEFAULT
296             - which channel is alpha is specified in stbir_pixel_layout, see enum for details
297 
298       FUTURE TODOS
299         *  For polyphase integral filters, we just memcpy the coeffs to dupe
300            them, but we should indirect and use the same coeff memory.
301         *  Add pixel layout conversions for sensible different channel counts
302            (maybe, 1->3/4, 3->4, 4->1, 3->1).
303          * For SIMD encode and decode scanline routines, do any pre-aligning
304            for bad input/output buffer alignments and pitch?
305          * For very wide scanlines, we should do vertical strips to stay within
306            L2 cache. Maybe do chunks of 1K pixels at a time. There would be
307            some pixel reconversion, but probably dwarfed by things falling out
308            of cache. Probably also something possible with alternating between
309            scattering and gathering at high resize scales?
310          * Rewrite the coefficient generator to do many at once.
311          * AVX-512 vertical kernels - worried about downclocking here.
312          * Convert the reincludes to macros when we know they aren't changing.
313          * Experiment with pivoting the horizontal and always using the
314            vertical filters (which are faster, but perhaps not enough to overcome
315            the pivot cost and the extra memory touches). Need to buffer the whole
316            image so have to balance memory use.
317          * Most of our code is internally function pointers, should we compile
318            all the SIMD stuff always and dynamically dispatch?
319 
320    CONTRIBUTORS
321       Jeff Roberts: 2.0 implementation, optimizations, SIMD
322       Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
323       Fabian Giesen: half float and srgb converters
324       Sean Barrett: API design, optimizations
325       Jorge L Rodriguez: Original 1.0 implementation
326       Aras Pranckevicius: bugfixes
327       Nathan Reed: warning fixes for 1.0
328 
329    REVISIONS
330       2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
331                           with AVX-2, fix some weird scaling edge conditions with
332                           point sample mode.
333       2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
334                           fix MSVC 32-bit arm half float routines.
335       2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
336                           hardware half floats).
337       2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
338                           to Ryan Salsbury), fix for sub-rect resizes, use the
339                           pragmas to control unrolling when they are available.
340       2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
341                           wide scanlines when downsampling (caused by extra input 
342                           converting), fix for wide scanline resamples with many 
343                           splits (int overflow), fix GCC warning.
344       2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
345                           undersampling a single row on rare resize ratios (about 1%).
346       2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
347                         fix for output callback (thanks Julien Koenen).
348       2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
349       2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
350       2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
351                           2x-5x faster without simd, 4x-12x faster with simd,
352                           in some cases, 20x to 40x faster esp resizing large to very small.
353       0.96 (2019-03-04) fixed warnings
354       0.95 (2017-07-23) fixed warnings
355       0.94 (2017-03-18) fixed warnings
356       0.93 (2017-03-03) fixed bug with certain combinations of heights
357       0.92 (2017-01-02) fix integer overflow on large (>2GB) images
358       0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
359       0.90 (2014-09-17) first released version
360 
361    LICENSE
362      See end of file for license information.
363 */
364 
365 // __v_ start
// When building with TinyCC (detected via its __TINYC__ macro), define
// STBIR_NO_SIMD, which forces SIMD off in all cases (see the SIMD notes in the
// header comment above).
// NOTE(review): the "__v_ start" / "__v_ end" markers presumably delimit a
// downstream patch that is not part of upstream stb - confirm before syncing.
366 #ifdef __TINYC__
367 #define STBIR_NO_SIMD
368 #endif
369 // __v_ end
370 
371 #if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes
372 
373 #ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
374 #define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
375 
376 #include <stddef.h>
// Fixed-width unsigned integer aliases (8, 16, 32 and 64 bits).
// Under MSVC (_MSC_VER) they are spelled with built-in types, including the
// compiler-specific unsigned __int64, so <stdint.h> is not required there;
// every other compiler takes the definitions from <stdint.h>.
377 #ifdef _MSC_VER
378 typedef unsigned char    stbir_uint8;
379 typedef unsigned short   stbir_uint16;
380 typedef unsigned int     stbir_uint32;
381 typedef unsigned __int64 stbir_uint64;
382 #else
383 #include <stdint.h>
384 typedef uint8_t  stbir_uint8;
385 typedef uint16_t stbir_uint16;
386 typedef uint32_t stbir_uint32;
387 typedef uint64_t stbir_uint64;
388 #endif
389 
// _M_IX86_FP is the MSVC macro that reports the /arch setting on 32-bit x86
// (per Microsoft's documentation: 0 = IA32, 1 = SSE, 2 = SSE2 or higher).
// Any value >= 1 defines STBIR_SSE, unless the user already defined it.
// STBIR_SSE is one of the conditions the x86 check that follows uses to
// define STBIR_SSE2.
// NOTE(review): this means /arch:SSE (value 1) also ends up selecting the
// SSE2 code path - confirm that is intended.
390 #ifdef _M_IX86_FP
391 #if ( _M_IX86_FP >= 1 )
392 #ifndef STBIR_SSE
393 #define STBIR_SSE
394 #endif
395 #endif
396 #endif
397 
// x86 SIMD feature selection.
//
// Entered when any x86-64 target macro is defined, when the compiler reports
// SSE2 (__SSE2__), or when the user (or the _M_IX86_FP check above) defined
// STBIR_SSE / STBIR_SSE2. Inside:
//   * STBIR_SSE2  is always defined.
//   * STBIR_AVX   is defined when __AVX__ or STBIR_AVX2 is defined, unless
//                 STBIR_NO_AVX is defined.
//   * STBIR_AVX2  is defined when __AVX2__ or STBIR_AVX2 is defined, unless
//                 STBIR_NO_AVX2 is defined.
//   * STBIR_FP16C is defined automatically with AVX2 on MSVC (not clang-cl),
//                 or on any compiler that defines __F16C__.
// Every #define is guarded by #ifndef so a user-supplied definition is kept.
398 #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
399   #ifndef STBIR_SSE2
400     #define STBIR_SSE2
401   #endif
  // AVX: requested by the compiler flag (__AVX__) or implied by STBIR_AVX2
402   #if defined(__AVX__) || defined(STBIR_AVX2)
403     #ifndef STBIR_AVX
404       #ifndef STBIR_NO_AVX
405         #define STBIR_AVX
406       #endif
407     #endif
408   #endif
  // AVX2: requested by the compiler flag (__AVX2__) or by the user
409   #if defined(__AVX2__) || defined(STBIR_AVX2)
410     #ifndef STBIR_NO_AVX2
411       #ifndef STBIR_AVX2
412         #define STBIR_AVX2
413       #endif
414       #if defined( _MSC_VER ) && !defined(__clang__)
415         #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c
416           #define STBIR_FP16C
417         #endif
418       #endif
419     #endif
420   #endif
421   #ifdef __F16C__
422     #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
423       #define STBIR_FP16C
424     #endif
425   #endif
426 #endif
427 
// ARM NEON selection: define STBIR_NEON (unless already defined) on 64-bit ARM
// targets (_M_ARM64, __aarch64__, __arm64__), when bit value 4 of
// __ARM_NEON_FP is set, or when __ARM_NEON__ is defined.
// An undefined __ARM_NEON_FP evaluates to 0 inside #if, so that term is simply
// false on compilers that do not provide it.
// NOTE(review): per the ARM C Language Extensions, bit value 4 of
// __ARM_NEON_FP presumably indicates single-precision NEON support - confirm.
428 #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
429 #ifndef STBIR_NEON
430 #define STBIR_NEON
431 #endif
432 #endif
433 
// On 32-bit ARM, silently cancel a user request for fused multiply-add by
// undefining STBIR_USE_FMA.
// NOTE(review): the inline comment below mentions MSVC only, but the condition
// also matches __arm__, which is not an MSVC macro - confirm that disabling FMA
// for every 32-bit ARM compiler is intended.
434 #if defined(_M_ARM) || defined(__arm__)
435 #ifdef STBIR_USE_FMA
436 #undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
437 #endif
438 #endif
439 
// WebAssembly SIMD selection: define STBIR_WASM (unless already defined) only
// when the target is wasm (__wasm__) AND 128-bit SIMD is enabled
// (__wasm_simd128__).
440 #if defined(__wasm__) && defined(__wasm_simd128__)
441 #ifndef STBIR_WASM
442 #define STBIR_WASM
443 #endif
444 #endif
445 
// STBIRDEF is the linkage prefix placed on every public declaration below.
// A user-supplied STBIRDEF is kept as-is. Otherwise:
//   * STB_IMAGE_RESIZE_STATIC defined -> static (API private to the including file)
//   * C++                             -> extern "C" (unmangled names)
//   * C                               -> extern
446 #ifndef STBIRDEF
447 #ifdef STB_IMAGE_RESIZE_STATIC
448 #define STBIRDEF static
449 #else
450 #ifdef __cplusplus
451 #define STBIRDEF extern "C"
452 #else
453 #define STBIRDEF extern
454 #endif
455 #endif
456 #endif
457 
458 //////////////////////////////////////////////////////////////////////////////
459 ////   start "header file" ///////////////////////////////////////////////////
460 //
461 // Easy-to-use API:
462 //
463 //     * stride is the offset between successive rows of image data
464 //        in memory, in bytes. specify 0 for packed continuously in memory
465 //     * colorspace is linear or sRGB as specified by function name
466 //     * Uses the default filters
467 //     * Uses edge mode clamped
468 //     * returned result is the output pixel pointer on success or 0 (NULL) in case of an error.
469 
470 
471 // stbir_pixel_layout specifies:
472 //   number of channels
473 //   order of channels
474 //   whether color is premultiplied by alpha
475 // for back compatibility, you can cast the old channel count to an stbir_pixel_layout
// Pixel layout selector passed to every resize entry point.
//
// The values 1, 2, 3 and 4 equal the channel count of the layout they name,
// which is what makes the "cast the old channel count" back-compat path work.
// Note that casting 4 yields STBIR_RGBA (non-premultiplied alpha), not
// STBIR_4CHANNEL; the remaining values (0 and 5 and up) are plain identifiers
// and do not correspond to a channel count.
476 typedef enum
477 {
478   STBIR_1CHANNEL = 1,
479   STBIR_2CHANNEL = 2,
480   STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping)
481   STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
482   STBIR_4CHANNEL = 5,               // 4 channels, none of which is treated as alpha
483 
484   STBIR_RGBA = 4,                   // alpha formats, where alpha is NOT premultiplied into color channels
485   STBIR_BGRA = 6,
486   STBIR_ARGB = 7,
487   STBIR_ABGR = 8,
488   STBIR_RA   = 9,
489   STBIR_AR   = 10,
490 
491   STBIR_RGBA_PM = 11,               // alpha formats, where alpha is premultiplied into color channels
492   STBIR_BGRA_PM = 12,
493   STBIR_ARGB_PM = 13,
494   STBIR_ABGR_PM = 14,
495   STBIR_RA_PM   = 15,
496   STBIR_AR_PM   = 16,
497 
  // The _NO_AW names share the numeric values 11..16 with the _PM names above.
498   STBIR_RGBA_NO_AW = 11,            // alpha formats, where NO alpha weighting is applied at all!
499   STBIR_BGRA_NO_AW = 12,            //   these are just synonyms for the _PM flags (which also do
500   STBIR_ARGB_NO_AW = 13,            //   no alpha weighting). These names just make it more clear
501   STBIR_ABGR_NO_AW = 14,            //   for some folks).
502   STBIR_RA_NO_AW   = 15,
503   STBIR_AR_NO_AW   = 16,
504 
505 } stbir_pixel_layout;
506 
507 //===============================================================
508 //  Simple-complexity API
509 //
510 //    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
511 //--------------------------------
512 
// Easy API, 8-bit channels, sRGB colorspace (colorspace is given by the
// function name). Uses the default filters and clamped edges.
//   input_pixels / output_pixels : source and destination image memory;
//                                  a NULL output_pixels makes the library
//                                  allocate the buffer and return it
//   *_w, *_h                     : image dimensions in pixels
//   *_stride_in_bytes            : byte offset between rows; 0 means packed
//   pixel_type                   : channel count/order and alpha handling
// Returns the output buffer.
513 STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
514                                                         unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
515                                                         stbir_pixel_layout pixel_type );
516 
// Same parameters and return convention as stbir_resize_uint8_srgb, but the
// 8-bit data is treated as linear (per the function name).
517 STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
518                                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
519                                                           stbir_pixel_layout pixel_type );
520 
// Same parameters and return convention, for float channels treated as linear.
// Strides are still expressed in bytes, not in floats.
521 STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
522                                                   float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
523                                                   stbir_pixel_layout pixel_type );
524 //===============================================================
525 
526 //===============================================================
527 // Medium-complexity API
528 //
529 // This extends the easy-to-use API as follows:
530 //
531 //     * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
532 //     * Edge wrap can be selected explicitly
533 //     * Filter can be selected explicitly
534 //--------------------------------
535 
// Edge mode selector (the `edge` argument of stbir_resize; the easy API always
// uses the clamped mode).
// NOTE(review): the exact sampling rule of each mode is not visible in this
// part of the file; the per-value notes marked "presumably" are inferred from
// the names - confirm against the implementation.
536 typedef enum
537 {
538   STBIR_EDGE_CLAMP   = 0,  // presumably repeats the nearest edge pixel
539   STBIR_EDGE_REFLECT = 1,  // presumably mirrors the image across the edge
540   STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
541   STBIR_EDGE_ZERO    = 3,  // presumably treats outside pixels as zero
542 } stbir_edge;
543 
// Resampling filter selector.
// STBIR_FILTER_DEFAULT defers to the compile-time defaults, which can be
// overridden with STBIR_DEFAULT_FILTER_UPSAMPLE / STBIR_DEFAULT_FILTER_DOWNSAMPLE.
// STBIR_FILTER_OTHER denotes a user-supplied kernel (see the header notes on
// stbir_set_filter_callbacks).
544 typedef enum
545 {
546   STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
547   STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
548   STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
549   STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque
550   STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
551   STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netravali filter with B=1/3, C=1/3
552   STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
553   STBIR_FILTER_OTHER        = 7,  // User callback specified
554 } stbir_filter;
555 
// Per-channel storage format of the pixel data.
// For STBIR_TYPE_UINT8_SRGB the alpha channel (when the pixel layout has one)
// is converted linearly rather than through the sRGB converters, as described
// in the ALPHA CHANNEL notes; STBIR_TYPE_UINT8_SRGB_ALPHA is the exception
// where alpha is sRGB-encoded too.
556 typedef enum
557 {
558   STBIR_TYPE_UINT8            = 0,  // 8-bit unsigned
559   STBIR_TYPE_UINT8_SRGB       = 1,  // 8-bit unsigned, sRGB-encoded color
560   STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
561   STBIR_TYPE_UINT16           = 3,  // 16-bit unsigned
562   STBIR_TYPE_FLOAT            = 4,  // float
563   STBIR_TYPE_HALF_FLOAT       = 5   // half-precision float
564 } stbir_datatype;
565 
566 // medium api
// stbir_resize: like the easy API calls, but the caller chooses the pixel
// layout, data type, edge mode and filter explicitly. A single data_type, edge
// and filter value is given here; separate per-axis / per-direction choices
// are listed as features of the extended API.
// Pixel pointers are void * because the element type depends on data_type;
// strides are in bytes.
// NOTE(review): the return value is presumably the output buffer, following
// the easy API convention (allocated when output_pixels is NULL) - confirm.
567 STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
568                                      void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
569                                stbir_pixel_layout pixel_layout, stbir_datatype data_type,
570                                stbir_edge edge, stbir_filter filter );
571 //===============================================================
572 
573 
574 
575 //===============================================================
576 // Extended-complexity API
577 //
578 // This API exposes all resize functionality.
579 //
580 //     * Separate filter types for each axis
581 //     * Separate edge modes for each axis
582 //     * Separate input and output data types
583 //     * Can specify regions with subpixel correctness
584 //     * Can specify alpha flags
585 //     * Can specify a memory callback
586 //     * Can specify a callback data type for pixel input and output
587 //     * Can be threaded for a single resize
588 //     * Can be used to resize many frames without recalculating the sampler info
589 //
590 //  Use this API as follows:
591 //     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
592 //     2) Call any of the stbir_set functions
593 //     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
594 //        with the same input and output dimensions (like resizing video frames)
595 //     4) Resample by calling stbir_resize_extended().
596 //     5) Call stbir_free_samplers() if you called stbir_build_samplers()
597 //--------------------------------
598 
599 
600 // Types:
601 
// NOTE: the four typedefs below name function types, not pointer types;
// STBIR_RESIZE stores them as pointers (e.g. `stbir_input_callback * input_cb`).

602 // INPUT CALLBACK: this callback is used for input scanlines
//   Receives an optional output buffer, the input pointer, a pixel count, an
//   x/y position and a user context pointer; returns a const pointer.
//   NOTE(review): presumably the returned pointer is the scanline data the
//   resizer should read (either input_ptr or optional_output) - confirm.
603 typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
604 
605 // OUTPUT CALLBACK: this callback is used for output scanlines
//   Receives a const pointer to the output data, a pixel count, the row index
//   y and a user context pointer; returns nothing.
606 typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
607 
608 // callbacks for user installed filters
//   kernel : returns a float for position x (centered at zero) at the given scale
//   support: returns a float for the given scale
//   NOTE(review): presumably the kernel returns the filter weight and support
//   returns the filter radius - confirm against the implementation.
609 typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
610 typedef float stbir__support_callback( float scale, void * user_data );
611 
612 // internal structure with precomputed scaling
613 typedef struct stbir__info stbir__info;
614 
615 typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
616 {
617   void * user_data;
618   void const * input_pixels;
619   int input_w, input_h;
620   double input_s0, input_t0, input_s1, input_t1;
621   stbir_input_callback * input_cb;
622   void * output_pixels;
623   int output_w, output_h;
624   int output_subx, output_suby, output_subw, output_subh;
625   stbir_output_callback * output_cb;
626   int input_stride_in_bytes;
627   int output_stride_in_bytes;
628   int splits;
629   int fast_alpha;
630   int needs_rebuild;
631   int called_alloc;
632   stbir_pixel_layout input_pixel_layout_public;
633   stbir_pixel_layout output_pixel_layout_public;
634   stbir_datatype input_data_type;
635   stbir_datatype output_data_type;
636   stbir_filter horizontal_filter, vertical_filter;
637   stbir_edge horizontal_edge, vertical_edge;
638   stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
639   stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
640   stbir__info * samplers;
641 } STBIR_RESIZE;
642 
643 // extended complexity api
644 
645 
646 // First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
647 STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
648                                  const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
649                                        void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
650                                  stbir_pixel_layout pixel_layout, stbir_datatype data_type );
651 
652 //===============================================================
653 // You can update these parameters any time after resize_init and there is no cost
654 //--------------------------------
655 
656 STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
657 STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
658 STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
659 STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
660 
661 //===============================================================
662 
663 
664 //===============================================================
665 // If you call any of these functions, you will trigger a sampler rebuild!
666 //--------------------------------
667 
668 STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout );  // sets new buffer layouts
669 STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
670 
671 STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
672 STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
673 
674 STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
675 STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
676 STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
677 
678 // when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
679 //   that fills the zero alpha pixel's RGB values with something plausible.  If you don't care about areas of
680 //   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
681 //   types of resizes.
682 STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
683 //===============================================================
684 
685 
686 //===============================================================
687 // You can call build_samplers to prebuild all the internal data we need to resample.
688 //   Then, if you call resize_extended many times with the same resize, you only pay the
689 //   cost once.
690 // If you do call build_samplers, you MUST call free_samplers eventually.
691 //--------------------------------
692 
693 // This builds the samplers and does one allocation
694 STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
695 
696 // You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
697 STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
698 //===============================================================
699 
700 
701 // And this is the main function to perform the resize synchronously on one thread.
702 STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
703 
704 
705 //===============================================================
706 // Use these functions for multithreading.
707 //   1) You call stbir_build_samplers_with_splits first on the main thread
708 //   2) Then stbir_resize_extended_split on each thread
709 //   3) stbir_free_samplers when done on the main thread
710 //--------------------------------
711 
712 // This will build samplers for threading.
713 //   You can pass in the number of threads you'd like to use (try_splits).
714 //   It returns the number of splits (threads) that you can call it with.
715 //   It might be less if the image resize can't be split up that many ways.
716 
717 STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
718 
719 // This function does a split of the resizing (you call this function for each
720 // split, on multiple threads). A split is a piece of the output resize pixel space.
721 
722 // Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
723 
724 // Usually, you will always call stbir_resize_extended_split with split_start as the thread_index
725 //   and "1" for the split_count.
726 // But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
727 //   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
728 //   split_count each time to turn it into a 4 thread resize. (This is unusual).
729 
730 STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
731 //===============================================================
732 
733 
734 //===============================================================
735 // Pixel Callbacks info:
736 //--------------------------------
737 
738 //   The input callback is super flexible - it calls you with the input address
739 //   (based on the stride and base pointer), it gives you an optional_output
740 //   pointer that you can fill, or you can just return your own pointer into
741 //   your own data.
742 //
743 //   You can also do conversion from non-supported data types if necessary - in
744 //   this case, you ignore the input_ptr and just use the x and y parameters to
745 //   calculate your own input_ptr based on the size of each non-supported pixel.
746 //   (Something like the third example below.)
747 //
748 //   You can also install just an input or just an output callback by setting the
749 //   callback that you don't want to zero.
750 //
751 //     First example, progress: (getting a callback that you can monitor the progress):
752 //        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
753 //        {
754 //           percentage_done = y / input_height;
755 //           return input_ptr;  // use buffer from call
756 //        }
757 //
758 //     Next example, copying: (copy from some other buffer or stream):
759 //        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
760 //        {
761 //           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
762 //           return optional_output;  // return the optional buffer that we filled
763 //        }
764 //
765 //     Third example, input another buffer without copying: (zero-copy from other buffer):
766 //        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
767 //        {
768 //           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
769 //           return pixels;       // return pointer to your data without copying
770 //        }
771 //
772 //
773 //   The output callback is considerably simpler - it just calls you so that you can dump
774 //   out each scanline. You could even directly copy out to disk if you have a simple format
775 //   like TGA or BMP. You can also convert to other output types here if you want.
776 //
777 //   Simple example:
778 //        void my_output( void const * output_ptr, int num_pixels, int y, void * context )
779 //        {
780 //           percentage_done = y / output_height;
781 //           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
782 //        }
783 //===============================================================
784 
785 
786 
787 
788 //===============================================================
789 // optional built-in profiling API
790 //--------------------------------
791 
792 #ifdef STBIR_PROFILE
793 
794 typedef struct STBIR_PROFILE_INFO
795 {
796   stbir_uint64 total_clocks;
797 
798   // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
799   //    there are "resize_count" number of zones
800   stbir_uint64 clocks[ 8 ];
801   char const ** descriptions;
802 
803   // count of clocks and descriptions
804   stbir_uint32 count;
805 } STBIR_PROFILE_INFO;
806 
807 // use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
808 STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
809 
810 // use after calling stbir_resize_extended
811 STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
812 
813 // use after calling stbir_resize_extended_split
814 STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
815 
816 //===============================================================
817 
818 #endif
819 
820 
821 ////   end header file   /////////////////////////////////////////////////////
822 #endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
823 
824 #if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
825 
826 #ifndef STBIR_ASSERT
827 #include <assert.h>
828 #define STBIR_ASSERT(x) assert(x)
829 #endif
830 
831 #ifndef STBIR_MALLOC
832 #include <stdlib.h>
833 #define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
834 #define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
835 // (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
836 #endif
837 
838 #ifdef _MSC_VER
839 
840 #define stbir__inline __forceinline
841 
842 #else
843 
844 #define stbir__inline __inline__
845 
846 // Clang address sanitizer
847 #if defined(__has_feature)
848   #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
849     #ifndef STBIR__SEPARATE_ALLOCATIONS
850       #define STBIR__SEPARATE_ALLOCATIONS
851     #endif
852   #endif
853 #endif
854 
855 #endif
856 
857 // GCC and MSVC
858 #if defined(__SANITIZE_ADDRESS__)
859   #ifndef STBIR__SEPARATE_ALLOCATIONS
860     #define STBIR__SEPARATE_ALLOCATIONS
861   #endif
862 #endif
863 
864 // Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
865 // Otherwise, this is a determinism disaster.
866 #ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
867 #if defined(_MSC_VER) && !defined(__clang__)
868 #if _MSC_VER > 1200
869 #pragma fp_contract(off)
870 #endif
871 #elif defined(__GNUC__) &&  !defined(__clang__)
872 #pragma GCC optimize("fp-contract=off")
873 #else
874 #pragma STDC FP_CONTRACT OFF
875 #endif
876 #endif
877 
878 #ifdef _MSC_VER
879 #define STBIR__UNUSED(v)  (void)(v)
880 #else
881 #define STBIR__UNUSED(v)  (void)sizeof(v)
882 #endif
883 
884 #define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
885 
886 
887 #ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
888 #define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
889 #endif
890 
891 #ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
892 #define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
893 #endif
894 
895 
896 #ifndef STBIR__HEADER_FILENAME
897 #define STBIR__HEADER_FILENAME "stb_image_resize2.h"
898 #endif
899 
900 // the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
901 //   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
// Internal pixel layouts. The numeric values are part of the contract: the
//   implementation does range comparisons on them.
// NOTE(review): the three groups look like "no alpha handling", "alpha" and
//   "premultiplied alpha" (_PM) -- confirm against the code that compares them.
typedef enum
{
  STBIRI_1CHANNEL =  0,
  STBIRI_2CHANNEL =  1,
  STBIRI_RGB      =  2,
  STBIRI_BGR      =  3,
  STBIRI_4CHANNEL =  4,

  STBIRI_RGBA     =  5,
  STBIRI_BGRA     =  6,
  STBIRI_ARGB     =  7,
  STBIRI_ABGR     =  8,
  STBIRI_RA       =  9,
  STBIRI_AR       = 10,

  STBIRI_RGBA_PM  = 11,
  STBIRI_BGRA_PM  = 12,
  STBIRI_ARGB_PM  = 13,
  STBIRI_ABGR_PM  = 14,
  STBIRI_RA_PM    = 15,
  STBIRI_AR_PM    = 16
} stbir_internal_pixel_layout;
924 
925 // define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
926 #define STBIR_BGR bad_dont_use_in_implementation
927 #define STBIR_1CHANNEL STBIR_BGR
928 #define STBIR_2CHANNEL STBIR_BGR
929 #define STBIR_RGB STBIR_BGR
930 #define STBIR_RGBA STBIR_BGR
931 #define STBIR_4CHANNEL STBIR_BGR
932 #define STBIR_BGRA STBIR_BGR
933 #define STBIR_ARGB STBIR_BGR
934 #define STBIR_ABGR STBIR_BGR
935 #define STBIR_RA STBIR_BGR
936 #define STBIR_AR STBIR_BGR
937 #define STBIR_RGBA_PM STBIR_BGR
938 #define STBIR_BGRA_PM STBIR_BGR
939 #define STBIR_ARGB_PM STBIR_BGR
940 #define STBIR_ABGR_PM STBIR_BGR
941 #define STBIR_RA_PM STBIR_BGR
942 #define STBIR_AR_PM STBIR_BGR
943 
944 // must match stbir_datatype
// size in bytes of one channel value, indexed by stbir_datatype
static unsigned char stbir__type_size[] = {
  1, // STBIR_TYPE_UINT8
  1, // STBIR_TYPE_UINT8_SRGB
  1, // STBIR_TYPE_UINT8_SRGB_ALPHA
  2, // STBIR_TYPE_UINT16
  4, // STBIR_TYPE_FLOAT
  2  // STBIR_TYPE_HALF_FLOAT
};
948 
949 // When gathering, the contributors are which source pixels contribute.
950 // When scattering, the contributors are which destination pixels are contributed to.
// Inclusive range [n0, n1] of contributing pixels.
typedef struct
{
  // First contributing pixel
  int n0;
  // Last contributing pixel
  int n1;
} stbir__contributors;
956 
// Sample-index extents of a whole filter.
typedef struct
{
  // First sample index for whole filter
  int lowest;
  // Last sample index for whole filter
  int highest;
  // widest single set of samples for an output
  int widest;
} stbir__filter_extent_info;
963 
// A run of decode-buffer pixels and where its data comes from in the input.
typedef struct
{
  // First pixel of decode buffer to write to
  int n0;
  // Last pixel of decode that will be written to
  int n1;
  // Pixel offset into input_scanline
  int pixel_offset_for_input;
} stbir__span;
970 
971 typedef struct stbir__scale_info
972 {
973   int input_full_size;
974   int output_sub_size;
975   float scale;
976   float inv_scale;
977   float pixel_shift; // starting shift in output pixel space (in pixels)
978   int scale_is_rational;
979   stbir_uint32 scale_numerator, scale_denominator;
980 } stbir__scale_info;
981 
982 typedef struct
983 {
984   stbir__contributors * contributors;
985   float* coefficients;
986   stbir__contributors * gather_prescatter_contributors;
987   float * gather_prescatter_coefficients;
988   stbir__scale_info scale_info;
989   float support;
990   stbir_filter filter_enum;
991   stbir__kernel_callback * filter_kernel;
992   stbir__support_callback * filter_support;
993   stbir_edge edge;
994   int coefficient_width;
995   int filter_pixel_width;
996   int filter_pixel_margin;
997   int num_contributors;
998   int contributors_size;
999   int coefficients_size;
1000   stbir__filter_extent_info extent_info;
1001   int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
1002   int gather_prescatter_num_contributors;
1003   int gather_prescatter_coefficient_width;
1004   int gather_prescatter_contributors_size;
1005   int gather_prescatter_coefficients_size;
1006 } stbir__sampler;
1007 
1008 typedef struct
1009 {
1010   stbir__contributors conservative;
1011   int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling falls off
1012   stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
1013 } stbir__extents;
1014 
// Working state owned by one split of a resize.
typedef struct
{
#ifdef STBIR_PROFILE
  // profiling zones, addressable by name or by index
  union
  {
    struct
    {
      stbir_uint64 total;
      stbir_uint64 looping;
      stbir_uint64 vertical;
      stbir_uint64 horizontal;
      stbir_uint64 decode;
      stbir_uint64 encode;
      stbir_uint64 alpha;
      stbir_uint64 unalpha;
    } named;
    stbir_uint64 array[8];
  } profile;
  stbir_uint64 * current_zone_excluded_ptr;
#endif

  float * decode_buffer;

  int ring_buffer_first_scanline;
  int ring_buffer_last_scanline;
  // first_scanline is at this index in the ring buffer
  int ring_buffer_begin_index;
  int start_output_y;
  int end_output_y;
  // used in scatter only
  int start_input_y;
  int end_input_y;

#ifdef STBIR__SEPARATE_ALLOCATIONS
  // one pointer for each ring buffer
  float ** ring_buffers;
#else
  // one big buffer that we index into
  float * ring_buffer;
#endif

  float * vertical_buffer;

  // NOTE(review): presumably padding so neighbouring per-split entries do not
  //   share a cache line -- confirm
  char no_cache_straddle[64];
} stbir__per_split_info;
1043 
1044 typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
1045 typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
1046 typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
1047   stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
1048 typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
1049 typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
1050 
1051 struct stbir__info
1052 {
1053 #ifdef STBIR_PROFILE
1054   union
1055   {
1056     struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
1057     stbir_uint64 array[7];
1058   } profile;
1059   stbir_uint64 * current_zone_excluded_ptr;
1060 #endif
1061   stbir__sampler horizontal;
1062   stbir__sampler vertical;
1063 
1064   void const * input_data;
1065   void * output_data;
1066 
1067   int input_stride_bytes;
1068   int output_stride_bytes;
1069   int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
1070   int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
1071 
1072   stbir_datatype input_type;
1073   stbir_datatype output_type;
1074 
1075   stbir_input_callback * in_pixels_cb;
1076   void * user_data;
1077   stbir_output_callback * out_pixels_cb;
1078 
1079   stbir__extents scanline_extents;
1080 
1081   void * alloced_mem;
1082   stbir__per_split_info * split_info;  // by default 1, but there will be N of these allocated based on the thread init you did
1083 
1084   stbir__decode_pixels_func * decode_pixels;
1085   stbir__alpha_weight_func * alpha_weight;
1086   stbir__horizontal_gather_channels_func * horizontal_gather_channels;
1087   stbir__alpha_unweight_func * alpha_unweight;
1088   stbir__encode_pixels_func * encode_pixels;
1089 
1090   int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
1091   int splits; // count of splits
1092 
1093   stbir_internal_pixel_layout input_pixel_layout_internal;
1094   stbir_internal_pixel_layout output_pixel_layout_internal;
1095 
1096   int input_color_and_type;
1097   int offset_x, offset_y; // offset within output_data
1098   int vertical_first;
1099   int channels;
1100   int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
1101   size_t alloced_total;
1102 };
1103 
1104 
// largest 8-bit / 16-bit channel values as floats, and their reciprocals
#define stbir__max_uint8_as_float            255.0f
#define stbir__max_uint16_as_float           65535.0f
#define stbir__max_uint8_as_float_inverted   ( 1.0f / 255.0f )
#define stbir__max_uint16_as_float_inverted  ( 1.0f / 65535.0f )

// 2^-120, built by dividing 1.0f by 2^20 six times
#define stbir__small_float \
  ( 1.0f / ( 1 << 20 ) / ( 1 << 20 ) / ( 1 << 20 ) \
         / ( 1 << 20 ) / ( 1 << 20 ) / ( 1 << 20 ) )
1110 
1111 // min/max friendly
// Clamps x in place to [xmin, xmax]. x must be an assignable lvalue; all three
//   arguments may be evaluated more than once, so pass side-effect-free
//   expressions. The upper bound is checked after the lower bound is applied.
// NOTE(review): the for(;;)/break wrapper is presumably used in place of
//   do/while(0) to dodge constant-condition warnings -- confirm before changing.
#define STBIR_CLAMP(x, xmin, xmax) for(;;) {   \
  if ( (x) < (xmin) ) { (x) = (xmin); }        \
  if ( (x) > (xmax) ) { (x) = (xmax); }        \
  break;                                       \
}
1117 
1118 static stbir__inline int stbir__min(int a, int b)
1119 {
1120   return a < b ? a : b;
1121 }
1122 
1123 static stbir__inline int stbir__max(int a, int b)
1124 {
1125   return a > b ? a : b;
1126 }
1127 
1128 static float stbir__srgb_uchar_to_linear_float[256] = {
1129   0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
1130   0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
1131   0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
1132   0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
1133   0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
1134   0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
1135   0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
1136   0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
1137   0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
1138   0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
1139   0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
1140   0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
1141   0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
1142   0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
1143   0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
1144   0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
1145   0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
1146   0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
1147   0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
1148   0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
1149   0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
1150   0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
1151   0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
1152   0.982251f, 0.991102f, 1.0f
1153 };
1154 
// View of a 32-bit float as its raw bits. u must stay the first member:
//   brace initializers such as { 0x3f7fffff } set the bit pattern through it.
typedef union
{
  unsigned int u; // raw bit pattern
  float f;        // the same bits read as a float
} stbir__FP32;
1160 
1161 // From https://gist.github.com/rygorous/2203834
1162 
1163 static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
1164   0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
1165   0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
1166   0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
1167   0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
1168   0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
1169   0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
1170   0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
1171   0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
1172   0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
1173   0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
1174   0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
1175   0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
1176   0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
1177 };
1178 
1179 static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
1180 {
1181   static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
1182   static const stbir__FP32 minval = { (127-13) << 23 };
1183   stbir_uint32 tab,bias,scale,t;
1184   stbir__FP32 f;
1185 
1186   // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
1187   // The tests are carefully written so that NaNs map to 0, same as in the reference
1188   // implementation.
1189   if (!(in > minval.f)) // written this way to catch NaNs
1190       return 0;
1191   if (in > almostone.f)
1192       return 255;
1193 
1194   // Do the table lookup and unpack bias, scale
1195   f.f = in;
1196   tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
1197   bias = (tab >> 16) << 9;
1198   scale = tab & 0xffff;
1199 
1200   // Grab next-highest mantissa bits and perform linear interpolation
1201   t = (f.u >> 12) & 0xff;
1202   return (unsigned char) ((bias + scale*t) >> 16);
1203 }
1204 
1205 #ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
1206 #define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
1207 #endif
1208 
1209 #ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
1210 #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
1211 #endif
1212 
1213 // restrict pointers for the output pointers, other loop and unroll control
1214 #if defined( _MSC_VER ) && !defined(__clang__)
1215   #define STBIR_STREAMOUT_PTR( star ) star __restrict
1216   #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
1217   #if _MSC_VER >= 1900
1218     #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) 
1219   #else
1220     #define STBIR_NO_UNROLL_LOOP_START 
1221   #endif
1222 #elif defined( __clang__ )
1223   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1224   #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) 
1225   #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
1226     #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
1227   #else
1228     #define STBIR_NO_UNROLL_LOOP_START
1229   #endif 
1230 #elif defined( __GNUC__ )
1231   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1232   #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
1233   #if __GNUC__ >= 14
1234     #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
1235   #else
1236     #define STBIR_NO_UNROLL_LOOP_START
1237   #endif
1238   #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
1239 #else
1240   #define STBIR_STREAMOUT_PTR( star ) star
1241   #define STBIR_NO_UNROLL( ptr )
1242   #define STBIR_NO_UNROLL_LOOP_START
1243 #endif
1244 
1245 #ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
1246 #define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
1247 #endif
1248 
1249 #ifdef STBIR_NO_SIMD // force simd off for whatever reason
1250 
1251 // force simd off overrides everything else, so clear it all
1252 
1253 #ifdef STBIR_SSE2
1254 #undef STBIR_SSE2
1255 #endif
1256 
1257 #ifdef STBIR_AVX
1258 #undef STBIR_AVX
1259 #endif
1260 
1261 #ifdef STBIR_NEON
1262 #undef STBIR_NEON
1263 #endif
1264 
1265 #ifdef STBIR_AVX2
1266 #undef STBIR_AVX2
1267 #endif
1268 
1269 #ifdef STBIR_FP16C
1270 #undef STBIR_FP16C
1271 #endif
1272 
1273 #ifdef STBIR_WASM
1274 #undef STBIR_WASM
1275 #endif
1276 
1277 #ifdef STBIR_SIMD
1278 #undef STBIR_SIMD
1279 #endif
1280 
1281 #else // STBIR_SIMD
1282 
1283 // __v_ start
1284 # if defined(STBIR_SSE2) && !defined(__TINYC__)
1285 // __v_ end
       // ---- SSE2 backend --------------------------------------------------
       // Selected when STBIR_SSE2 is defined and the compiler is not TinyCC.
       // NOTE(review): the "__v_ start/end" markers presumably bracket a
       // downstream patch to this condition - confirm against upstream.
1286   #include <emmintrin.h>
1287 
       // 4 x float and 4 x 32-bit integer vector types.
1288   #define stbir__simdf __m128
1289   #define stbir__simdi __m128i
1290 
       // Bit-pattern reinterpretation between the two vector types (no conversion).
1291   #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
1292   #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
1293 
       // Loads. All of them use unaligned-safe instructions.
       //   load / simdi_load  : 16 bytes
       //   load1 / load1z     : one float into lane 0 (same instruction for both)
       //   frep4 / load1frep4 : broadcast a float value to all four lanes
       //   load2 / load2z     : 8 bytes into the low half (same instruction for both)
       //   load2hmerge        : keep the low half of reg, load 8 bytes into the high half
1294   #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
1295   #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
1296   #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values can be random (not denormal or nan for perf)
1297   #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
1298   #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values must be zero
1299   #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
1300   #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
1301   #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
1302   #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
1303   #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
1304 
       // zeroP() is an expression yielding a zero vector; zero(reg) assigns one.
1305   #define stbir__simdf_zeroP() _mm_setzero_ps()
1306   #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
1307 
       // Float stores: full vector, lane 0 only, low half, high half.
1308   #define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
1309   #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
1310   #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
1311   #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
1312 
       // Integer stores: full vector, low 4 bytes, low 8 bytes.
1313   #define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
1314   #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
1315   #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
1316 
       // Prefetch hint with _MM_HINT_T0.
1317   #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
1318 
       // Zero-extend the 16 bytes of ireg into four vectors of 4 x u32
       // (out0 = bytes 0..3, out1 = 4..7, out2 = 8..11, out3 = 12..15).
       // out2 and out3 are written before ireg is last read, so ireg must not be
       // the same variable as out2. The macro declares a local named `zero`.
1319   #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1320   { \
1321     stbir__simdi zero = _mm_setzero_si128(); \
1322     out2 = _mm_unpacklo_epi8( ireg, zero ); \
1323     out3 = _mm_unpackhi_epi8( ireg, zero ); \
1324     out0 = _mm_unpacklo_epi16( out2, zero ); \
1325     out1 = _mm_unpackhi_epi16( out2, zero ); \
1326     out2 = _mm_unpacklo_epi16( out3, zero ); \
1327     out3 = _mm_unpackhi_epi16( out3, zero ); \
1328   }
1329 
     // Zero-extend only the low 4 bytes of ireg into 4 x u32.
1330 #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
1331   { \
1332     stbir__simdi zero = _mm_setzero_si128(); \
1333     out = _mm_unpacklo_epi8( ireg, zero ); \
1334     out = _mm_unpacklo_epi16( out, zero ); \
1335   }
1336 
       // Zero-extend the 8 x u16 of ireg into two vectors of 4 x u32
       // (out0 = low four, out1 = high four). out0 is written before ireg is
       // read for out1, so ireg must not be the same variable as out0.
1337   #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
1338   { \
1339     stbir__simdi zero = _mm_setzero_si128(); \
1340     out0 = _mm_unpacklo_epi16( ireg, zero ); \
1341     out1 = _mm_unpackhi_epi16( ireg, zero ); \
1342   }
1343 
       // Float -> integer conversions. All use the truncating (cvtt) instructions.
       // The _uint8 / _short forms clamp to [0, max] first and return lane 0 only.
1344   #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
1345   #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
1346   #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
1347   #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
1348 
       // Lane-0 extraction, signed i32 -> float conversion, and basic arithmetic.
       // The *_mem forms take their second operand from (unaligned) memory; the
       // "1" forms use scalar (_ss) instructions that operate on lane 0.
1349   #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
1350   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
1351   #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
1352   #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
1353   #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1354   #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1355   #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1356   #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1357 
       // Multiply-add: out = add + mul1 * mul2.
       // With STBIR_USE_FMA the fused instructions are used (single rounding);
       // otherwise a separate multiply and add are issued.
1358   #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
1359   #include <immintrin.h>
1360   #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
1361   #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
1362   #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
1363   #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
1364   #else
1365   #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
1366   #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
1367   #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
1368   #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
1369   #endif
1370 
       // Scalar (lane 0) add and multiply.
1371   #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
1372   #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
1373 
       // Bitwise and / or on float vectors.
1374   #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
1375   #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
1376 
       // Per-lane min / max; the "1" forms use the scalar (_ss) instructions.
1377   #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
1378   #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
1379   #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
1380   #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
1381 
       // With reg0 = {0,1,2,3} and reg1 = {A,B,C,D}:
       //   ...to3ABx produces {3,A,B,2}   (last lane is "don't care")
       //   ...to23Ax produces {2,3,A,B}   (last lane is "don't care")
1382   #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
1383   #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
1384 
       // Alpha-broadcast helpers (lane order shown low to high):
       //   aaa1 : { alp[3], alp[3], alp[3], ones[2] }
       //   1aaa : { ones[0], alp[0], alp[0], alp[0] }
       //   a1a1 : { alp[1], 1, alp[3], 1 }   (the `ones` argument is not used)
       //   1a1a : { 1, alp[0], 1, alp[2] }   (the `ones` argument is not used)
       // a1a1 / 1a1a shift each 64-bit half by 32 bits, which zeroes the vacated
       // lanes, then OR in the 1.0f constants below.
1385   static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
1386   static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
1387   #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
1388   #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
1389   #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
1390   #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
1391 
       // General lane swizzle: result lane k takes source lane one/two/three/four.
       // The indices are pasted into the shuffle immediate unparenthesized, so
       // pass plain constants 0..3.
1392   #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
1393 
       // Integer bitwise ops and the 16-bit multiply / pairwise-add (pmaddwd).
1394   #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
1395   #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
1396   #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
1397 
       // Clamp the 8 floats in aa,bb to [0, STBIR_max_uint8_as_float], truncate to
       // integers and pack them to bytes. The low 8 bytes of `out` hold
       // aa[0..3], bb[0..3]; the high 8 bytes repeat the same values because the
       // final pack uses the same source for both halves.
       // Declares locals af, bf, a, b - arguments must not use those names.
1398   #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
1399   { \
1400     stbir__simdf af,bf; \
1401     stbir__simdi a,b; \
1402     af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
1403     bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
1404     af = _mm_max_ps( af, _mm_setzero_ps() ); \
1405     bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
1406     a = _mm_cvttps_epi32( af ); \
1407     b = _mm_cvttps_epi32( bf ); \
1408     a = _mm_packs_epi32( a, b ); \
1409     out = _mm_packus_epi16( a, a ); \
1410   }
1411 
       // Load 16 consecutive floats from ptr as a 4x4 matrix (rows o0..o3) and
       // transpose it in place, so that afterwards oK holds element K of each of
       // the four loaded rows.
       // This expands to four statements followed by a brace block, NOT a single
       // statement: do not use it as the unbraced body of an if/else/loop.
1412   #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
1413       stbir__simdf_load( o0, (ptr) );    \
1414       stbir__simdf_load( o1, (ptr)+4 );  \
1415       stbir__simdf_load( o2, (ptr)+8 );  \
1416       stbir__simdf_load( o3, (ptr)+12 ); \
1417       {                                  \
1418         __m128 tmp0, tmp1, tmp2, tmp3;   \
1419         tmp0 = _mm_unpacklo_ps(o0, o1);  \
1420         tmp2 = _mm_unpacklo_ps(o2, o3);  \
1421         tmp1 = _mm_unpackhi_ps(o0, o1);  \
1422         tmp3 = _mm_unpackhi_ps(o2, o3);  \
1423         o0 = _mm_movelh_ps(tmp0, tmp2);  \
1424         o1 = _mm_movehl_ps(tmp2, tmp0);  \
1425         o2 = _mm_movelh_ps(tmp1, tmp3);  \
1426         o3 = _mm_movehl_ps(tmp3, tmp1);  \
1427       }
1428 
       // Pack four vectors of 4 x i32 down to 16 bytes (signed saturation to
       // 16 bits, then unsigned saturation to 8 bits) and store them to ptr in
       // interleaved order: r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], ...
       // r0..r3 are used as scratch and are clobbered.
       // This expands to several statements, NOT a single one: do not use it as
       // the unbraced body of an if/else/loop.
       // The last line deliberately has no trailing backslash, so the macro ends
       // here and cannot absorb whatever line follows it.
1429   #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
1430       r0 = _mm_packs_epi32( r0, r1 ); \
1431       r2 = _mm_packs_epi32( r2, r3 ); \
1432       r1 = _mm_unpacklo_epi16( r0, r2 ); \
1433       r3 = _mm_unpackhi_epi16( r0, r2 ); \
1434       r0 = _mm_unpacklo_epi16( r1, r3 ); \
1435       r2 = _mm_unpackhi_epi16( r1, r3 ); \
1436       r0 = _mm_packus_epi16( r0, r2 ); \
1437       stbir__simdi_store( ptr, r0 );
1438 
       // Logical (zero-filling) right shift of each 32-bit lane by imm bits.
1439   #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
1440 
       // Helpers for writing brace initializers of integer vector constants.
       // The initializer element type differs by compiler, so the same 32-bit
       // values are spelled either as 16 chars (low byte first) or as 2 long longs
       // (lower-indexed 32-bit value in the low half).
       //   STBIR__CONST_4_32i(v)            : v repeated in all four 32-bit lanes
       //   STBIR__CONST_4d_32i(v0,v1,v2,v3) : four distinct lanes, v0 lowest
1441   #if defined(_MSC_VER) && !defined(__clang__)
1442     // msvc inits with 8 bytes
1443     #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
1444     #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
1445     #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
1446   #else
1447     // everything else inits with long long's
1448     #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
1449     #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
1450   #endif
1451 
       // Declare a vector constant `var` with x in every lane. No storage class
       // is supplied here; the use site may prepend one (e.g. static const).
       // STBIR__CONSTF / STBIR__CONSTI read such a constant; on this backend
       // they are the identity.
1452   #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
1453   #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
1454   #define STBIR__CONSTF(var) (var)
1455   #define STBIR__CONSTI(var) (var)
1456 
       // stbir__simdf_pack_to_8words(out, reg0, reg1):
       // clamp the 8 floats in reg0,reg1 to [0, STBIR_max_uint16_as_float],
       // truncate to integers and pack them into 8 x u16 (reg0 lanes first).
       //
       // With SSE4.1 the unsigned-saturating 32->16 pack exists directly.
       // STBIR_SSE41 is honored here as well so that this selection matches the
       // one made by stbir_simd_floorf / stbir_simd_ceilf, and so <smmintrin.h>
       // is included whenever those functions use the SSE4.1 rounding intrinsics.
1457   #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1458     #include <smmintrin.h>
1459     #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
1460   #else
         // Plain SSE2 only has a signed-saturating 32->16 pack. Bias the values
         // by -32768 so they fit the signed 16-bit range, pack, then undo the
         // bias with a wrapping 16-bit subtract of 0x8000.
         // The 0x80008000 lane constant is built from unsigned literals:
         // 32768<<16 on a signed int overflows, which is undefined in C.
1461     STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
1462     STBIR__SIMDI_CONST(stbir__s16_32768, ((32768u<<16)|32768u));
1463 
1464     #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
1465       { \
1466         stbir__simdi tmp0,tmp1; \
1467         tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
1468         tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
1469         tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
1470         tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
1471         out = _mm_packs_epi32( tmp0, tmp1 ); \
1472         out = _mm_sub_epi16( out, stbir__s16_32768 ); \
1473       }
1474 
1475   #endif
1476 
       // Mark that a SIMD backend is active.
1477   #define STBIR_SIMD
1478 
1479   // if we detect AVX, set the simd8 defines
1480   #ifdef STBIR_AVX
1481     #include <immintrin.h>
         // 8-wide (256-bit) float / integer vector types and unaligned load/store.
1482     #define STBIR_SIMD8
1483     #define stbir__simdf8 __m256
1484     #define stbir__simdi8 __m256i
1485     #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
1486     #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
1487     #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
1488     #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
1489     #define stbir__simdi8_store( ptr, reg )  _mm256_storeu_si256( (__m256i*)(ptr), reg )
1490     #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
1491 
1492     #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
1493     #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
1494 
         // add4halves: 4-wide result = bot4 + upper 128-bit half of top8.
         // load1b broadcasts one float from memory to all 8 lanes;
         // load1rep4 broadcasts one float from memory to all 4 lanes of a 128-bit vector.
1495     #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
1496     #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1497     #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1498     #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
1499     #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
1500     #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr )  // avx load instruction
1501 
         // 8-wide signed i32 <-> float conversions (float -> int truncates).
1502     #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
1503     #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
1504 
         // bot4s: { low half of a, low half of b };  top4s: { high half of a, high half of b }.
1505     #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
1506     #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
1507 
         // Upper 128-bit half of an 8-wide vector.
1508     #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
1509 
         // ---- AVX2 variants of the widen / narrow helpers ----
1510     #ifdef STBIR_AVX2
1511 
         // Zero-extend the 16 bytes of the 128-bit ireg into two vectors of
         // 8 x u32 (out0 = bytes 0..7, out1 = bytes 8..15). The 64-bit permutes
         // compensate for the unpack instructions working per 128-bit half.
1512     #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
1513     { \
1514       stbir__simdi8 a, zero  =_mm256_setzero_si256();\
1515       a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
1516       out0 = _mm256_unpacklo_epi16( a, zero ); \
1517       out1 = _mm256_unpackhi_epi16( a, zero ); \
1518     }
1519 
         // Clamp the 16 floats in aa,bb to [0, STBIR_max_uint8_as_floatX],
         // truncate, and pack to 16 bytes in a 128-bit `out` (aa lanes first,
         // then bb lanes).
1520     #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
1521     { \
1522       stbir__simdi8 t; \
1523       stbir__simdf8 af,bf; \
1524       stbir__simdi8 a,b; \
1525       af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1526       bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1527       af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1528       bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1529       a = _mm256_cvttps_epi32( af ); \
1530       b = _mm256_cvttps_epi32( bf ); \
1531       t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
1532       out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
1533     }
1534 
         // Zero-extend the 8 x u16 of the 128-bit ireg into 8 x u32.
         // NOTE(review): unlike its non-AVX2 counterpart, this macro body ends
         // with its own ';', so a use followed by ';' yields an extra empty
         // statement.
1535     #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
1536 
         // Clamp the 16 floats in aa,bb to [0, STBIR_max_uint16_as_floatX],
         // truncate, and pack to 16 x u16 (aa lanes first, then bb lanes).
1537     #define stbir__simdf8_pack_to_16words(out,aa,bb) \
1538       { \
1539         stbir__simdf8 af,bf; \
1540         stbir__simdi8 a,b; \
1541         af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1542         bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1543         af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1544         bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1545         a = _mm256_cvttps_epi32( af ); \
1546         b = _mm256_cvttps_epi32( bf ); \
1547         (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
1548       }
1549 
         // ---- AVX without AVX2: same helpers built from 128-bit integer ops ----
1550     #else
1551 
         // Zero-extend the 16 bytes of ireg into two vectors of 8 x u32
         // (out0 = bytes 0..7, out1 = bytes 8..15).
1552     #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
1553     { \
1554       stbir__simdi a,zero = _mm_setzero_si128(); \
1555       a = _mm_unpacklo_epi8( ireg, zero ); \
1556       out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
1557       a = _mm_unpackhi_epi8( ireg, zero ); \
1558       out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
1559     }
1560 
         // Clamp the 16 floats in aa,bb to [0, STBIR_max_uint8_as_floatX],
         // truncate, and pack to 16 bytes (aa lanes first, then bb lanes).
         // Each 256-bit integer vector is split into halves and packed with
         // 128-bit instructions; the final shuffle joins the two 8-byte results.
1561     #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
1562     { \
1563       stbir__simdi t; \
1564       stbir__simdf8 af,bf; \
1565       stbir__simdi8 a,b; \
1566       af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1567       bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1568       af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1569       bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1570       a = _mm256_cvttps_epi32( af ); \
1571       b = _mm256_cvttps_epi32( bf ); \
1572       out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
1573       out = _mm_packus_epi16( out, out ); \
1574       t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
1575       t = _mm_packus_epi16( t, t ); \
1576       out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
1577     }
1578 
         // Zero-extend the 8 x u16 of ireg into 8 x u32.
1579     #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
1580     { \
1581       stbir__simdi a,b,zero = _mm_setzero_si128(); \
1582       a = _mm_unpacklo_epi16( ireg, zero ); \
1583       b = _mm_unpackhi_epi16( ireg, zero ); \
1584       out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
1585     }
1586 
         // Clamp the 16 floats in aa,bb to [0, STBIR_max_uint16_as_floatX],
         // truncate, and pack to 16 x u16 (aa lanes first, then bb lanes).
1587     #define stbir__simdf8_pack_to_16words(out,aa,bb) \
1588       { \
1589         stbir__simdi t0,t1; \
1590         stbir__simdf8 af,bf; \
1591         stbir__simdi8 a,b; \
1592         af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1593         bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1594         af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1595         bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1596         a = _mm256_cvttps_epi32( af ); \
1597         b = _mm256_cvttps_epi32( bf ); \
1598         t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
1599         t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
1600         out = _mm256_setr_m128i( t0, t1 ); \
1601       }
1602 
1603     #endif
1604 
         // Index vectors for _mm256_permutevar_ps. That instruction selects
         // within each 128-bit half independently, so the "0123toXXXXYYYY"
         // names describe the result only when both halves of `in` hold the
         // same four values (e.g. after stbir__simdf8_load4b).
1605     static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
1606     #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
1607 
1608     static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
1609     #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
1610 
         // 4-wide result: lane 2 of the low half of `in`, broadcast.
1611     #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
1612 
         // Load 4 floats and replicate them into both 128-bit halves.
1613     #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
1614 
1615     static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
1616     #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
         // Add a 4-wide vector to the low half of an 8-wide one. The cast leaves
         // the upper half of the widened operand unspecified, so only the low
         // four lanes of `out` are meaningful.
1617     #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8,  _mm256_castps128_ps256( b ) )
1618 
         // Mask with the sign bit set in lanes 0..5 only: loads six floats and
         // zeroes lanes 6 and 7.
1619     static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i(  0x80000000,  0x80000000, 0, 0 ) };
1620     #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
1621 
         // Fixed lane shuffles, applied identically inside each 128-bit half:
         // with a half holding {0,1,2,3}, the digits after "to" give the source
         // lane for each result lane.
1622     #define stbir__simdf8_0123to00000000( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
1623     #define stbir__simdf8_0123to11111111( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
1624     #define stbir__simdf8_0123to22222222( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
1625     #define stbir__simdf8_0123to33333333( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
1626     #define stbir__simdf8_0123to21032103( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
1627     #define stbir__simdf8_0123to32103210( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
1628     #define stbir__simdf8_0123to12301230( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
1629     #define stbir__simdf8_0123to10321032( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
1630     #define stbir__simdf8_0123to30123012( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
1631 
1632     #define stbir__simdf8_0123to11331133( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
1633     #define stbir__simdf8_0123to00220022( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
1634 
         // 8-wide alpha-broadcast helpers: first blend lanes of `ones` into
         // `alp`, then shuffle within each 128-bit half.
         // Each of these expands to TWO statements separated by ';' - do not use
         // them as the unbraced body of an if/else/loop. `out` is read by the
         // second statement, so it must be a plain variable.
1635     #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
1636     #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
1637     #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
1638     #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
1639 
1640     #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
1641 
         // 8-wide multiply-add: out = add + mul1 * mul2 (fused only with
         // STBIR_USE_FMA). madd_mem4 multiplies a 4-wide `mul` by four floats
         // from memory and adds the products to the low half of `add`; the high
         // half receives add + 0.
1642     #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
1643     #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
1644     #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
1645     #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
1646     #else
1647     #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
1648     #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
1649     #define stbir__simdf8_madd_mem4( out, add, mul, ptr )  (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
1650     #endif
         // Low 128-bit half of an 8-wide float vector.
1651     #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
1652 
1653   #endif
1654 
       // Route STBIR_FLOORF to the SSE implementation below, replacing any
       // earlier definition.
1655   #ifdef STBIR_FLOORF
1656   #undef STBIR_FLOORF
1657   #endif
1658   #define STBIR_FLOORF stbir_simd_floorf
       // Returns the largest integral value not greater than x.
       //
       // With SSE4.1 available the dedicated rounding instruction is used.
       // The plain SSE2 fallback truncates through a 32-bit integer and then
       // subtracts 1 when truncation rounded up (negative non-integers).
       // That round trip is only meaningful for |x| < 2^31: outside that range
       // (and for inf / NaN) the conversion yields INT_MIN. Every float of that
       // magnitude is already integral, so x itself is returned in those cases.
1659   static stbir__inline float stbir_simd_floorf(float x)  // martins floorf
1660   {
1661     #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1662     __m128 t = _mm_set_ss(x);
1663     return _mm_cvtss_f32( _mm_floor_ss(t, t) );
1664     #else
1665     __m128 f = _mm_set_ss(x);
1666     __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
1667     __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
         // the comparisons are false for NaN, so NaN is passed through as well
1668     return ( x > -2147483648.0f && x < 2147483648.0f ) ? _mm_cvtss_f32(r) : x;
1669     #endif
1670   }
1671 
       // Route STBIR_CEILF to the SSE implementation below, replacing any
       // earlier definition.
1672   #ifdef STBIR_CEILF
1673   #undef STBIR_CEILF
1674   #endif
1675   #define STBIR_CEILF stbir_simd_ceilf
       // Returns the smallest integral value not less than x.
       //
       // With SSE4.1 available the dedicated rounding instruction is used.
       // The plain SSE2 fallback truncates through a 32-bit integer and then
       // adds 1 when truncation rounded down (positive non-integers).
       // That round trip is only meaningful for |x| < 2^31: outside that range
       // (and for inf / NaN) the conversion yields INT_MIN. Every float of that
       // magnitude is already integral, so x itself is returned in those cases.
1676   static stbir__inline float stbir_simd_ceilf(float x)  // martins ceilf
1677   {
1678     #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1679     __m128 t = _mm_set_ss(x);
1680     return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
1681     #else
1682     __m128 f = _mm_set_ss(x);
1683     __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
1684     __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
         // the comparisons are false for NaN, so NaN is passed through as well
1685     return ( x > -2147483648.0f && x < 2147483648.0f ) ? _mm_cvtss_f32(r) : x;
1686     #endif
1687   }
1688 
     // ---- ARM NEON backend: same macro vocabulary as the SSE2 section ----
1689 #elif defined(STBIR_NEON)
1690 
1691   #include <arm_neon.h>
1692 
       // 4 x float and 4 x unsigned 32-bit vector types.
1693   #define stbir__simdf float32x4_t
1694   #define stbir__simdi uint32x4_t
1695 
       // Bit-pattern reinterpretation between the two vector types.
1696   #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
1697   #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
1698 
       // Loads. load1 / simdi_load1 replicate the loaded value to every lane;
       // load1z places it in lane 0 of a zero vector. load2 and load2z both
       // fill the upper half with zero.
1699   #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
1700   #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
1701   #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1702   #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
1703   #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
1704   #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
1705   #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
1706   #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
1707   #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) )  // top values must be zero
1708   #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
1709 
1710   #define stbir__simdf_zeroP() vdupq_n_f32(0)
1711   #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
1712 
       // Float stores: full vector, lane 0, low half, high half.
1713   #define stbir__simdf_store( ptr, reg )  vst1q_f32( (float*)(ptr), reg )
1714   #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
1715   #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
1716   #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
1717 
       // Integer stores: full vector, lane 0, low half.
1718   #define stbir__simdi_store( ptr, reg )  vst1q_u32( (uint32_t*)(ptr), reg )
1719   #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
1720   #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
1721 
       // Prefetch expands to nothing on this backend.
1722   #define stbir__prefetch( ptr )
1723 
       // Zero-extend the 16 bytes of ireg into four vectors of 4 x u32
       // (out0 = bytes 0..3, ..., out3 = bytes 12..15). ireg is fully read into
       // the locals l and h before any output is written.
1724   #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1725   { \
1726     uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
1727     uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
1728     out0 = vmovl_u16( vget_low_u16 ( l ) ); \
1729     out1 = vmovl_u16( vget_high_u16( l ) ); \
1730     out2 = vmovl_u16( vget_low_u16 ( h ) ); \
1731     out3 = vmovl_u16( vget_high_u16( h ) ); \
1732   }
1733 
       // Zero-extend only the low 4 bytes of ireg into 4 x u32.
1734   #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
1735   { \
1736     uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
1737     out = vmovl_u16( vget_low_u16( tmp ) ); \
1738   }
1739 
       // Zero-extend the 8 x u16 of ireg into two vectors of 4 x u32.
1740   #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
1741   { \
1742     uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
1743     out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
1744     out1 = vmovl_u16( vget_high_u16( tmp ) ); \
1745   }
1746 
       // Float -> integer conversions (via vcvtq_s32_f32) and lane-0 extraction.
       // The _uint8 / _short forms clamp to [0, max] first and return lane 0.
1747   #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
1748   #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
1749   #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
1750   #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
1751   #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
       // Signed i32 -> float, and basic arithmetic. The "1" *_mem forms load a
       // single float replicated to all lanes and operate on the full vector.
1752   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
1753   #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1754   #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1755   #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1756   #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1757   #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1758   #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1759 
       // Multiply-add: out = add + mul1 * mul2. Fused (vfmaq) only with
       // STBIR_USE_FMA. The "1" forms are full-width here; there is no separate
       // lane-0-only implementation on this backend.
1760   #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
1761   #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1762   #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1763   #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
1764   #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
1765   #else
1766   #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1767   #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1768   #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
1769   #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
1770   #endif
1771 
       // "Scalar" add / multiply are full-width vector ops on this backend.
1772   #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1773   #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1774 
       // Bitwise and / or on float vectors, done through u32 reinterpretation.
1775   #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1776   #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1777 
       // Per-lane min / max; the "1" forms are the same full-width operations.
1778   #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1779   #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1780   #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1781   #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1782 
       // With reg0 = {0,1,2,3} and reg1 = {A,B,C,D}: vextq by 3 gives {3,A,B,C},
       // by 2 gives {2,3,A,B}.
1783   #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
1784   #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
1785 
       //   a1a1 : { alp[1], ones[0], alp[3], ones[1] }
       //   1a1a : { ones[0], alp[0], ones[1], alp[2] }
1786   #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
1787   #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
1788 
1789   #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1790 
1791     #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
1792     #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
1793 
1794     #if defined( _MSC_VER ) && !defined(__clang__)
1795       #define stbir_make16(a,b,c,d) vcombine_u8( \
1796         vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1797           ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
1798         vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
1799           ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
1800 
1801       static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
1802       {
1803         uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
1804         return r;
1805       }
1806     #else
           // Same byte-index vector and two-register table as the other compiler branch,
           // spelled as compound literals.
1807       #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
1808       #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
1809     #endif
1810
         // Lane permutes done as byte table lookups; one..four are 32-bit lane indices.
         // swiz reads from reg alone. swiz2 reads from the 32-byte table rega:regb, so
         // lane indices 4..7 select lanes of regb.
1811     #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
1812     #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
1813
         // Multiplies the eight signed 16-bit lanes of reg0 and reg1 into 32-bit products
         // and adds adjacent products, leaving four 32-bit sums in out.
         // The temporaries r0, r1, tmp0 and tmp1 are declared inside the macro's own
         // braces, so arguments must not be variables with those names.
1814     #define stbir__simdi_16madd( out, reg0, reg1 ) \
1815     { \
1816       int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1817       int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1818       int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
1819       int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
1820       (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
1821     }
1822 
1823   #else
1824 
         // 32-bit ARM versions: broadcast alp[3] (or alp[0]) to all lanes, then overwrite
         // lane 3 (or lane 0) with the literal 1.0f. The ones argument is not used here.
1825     #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
1826     #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
1827 
1828     #if defined( _MSC_VER ) && !defined(__clang__)
1829       static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
1830       {
1831         uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
1832         return r;
1833       }
           // Eight byte indices selecting 32-bit lanes a and b (lane k -> bytes 4k..4k+3),
           // packed into one 64-bit constant for vcreate_u8.
1834       #define stbir_make8(a,b) vcreate_u8( \
1835         (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1836         ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
1837     #else
           // Compound-literal forms of the table pair and the index vector.
1838       #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
1839       #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
1840     #endif
1841
         // Lane permute of reg built from two 8-byte table lookups: the first produces
         // output lanes (one, two), the second (three, four).
1842     #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
1843         vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
1844         vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
1845
         // Multiplies the eight signed 16-bit lanes of reg0 and reg1 into 32-bit products
         // and adds adjacent products, leaving four 32-bit sums in out. The pairwise add
         // is done on 64-bit halves and recombined.
         // The temporaries (r0, r1, tmp0, tmp1, out0, out1) live in the macro's own
         // braces, so arguments must not be variables with those names.
1846     #define stbir__simdi_16madd( out, reg0, reg1 ) \
1847     { \
1848       int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1849       int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1850       int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
1851       int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
1852       int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
1853       int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
1854       (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
1855     }
1856 
1857   #endif
1858 
       // Bitwise AND / OR of two integer registers.
1859   #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
1860   #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
1861 
       // Clamps the eight floats in aa and bb to [0, STBIR_max_uint8_as_float], converts
       // them to integers and narrows with saturation down to eight bytes (aa's four
       // first, then bb's). Those eight bytes are duplicated into both halves of out.
1862   #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
1863   { \
1864     float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1865     float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1866     int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
1867     int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
1868     uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
1869     out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
1870   }
1871 
       // Clamps the eight floats in aa and bb to [0, STBIR_max_uint16_as_float], converts
       // them to integers and narrows with unsigned saturation to eight 16-bit values
       // (aa's four in the low half of out, bb's four in the high half).
1872   #define stbir__simdf_pack_to_8words(out,aa,bb) \
1873   { \
1874     float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1875     float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1876     int32x4_t ai = vcvtq_s32_f32( af ); \
1877     int32x4_t bi = vcvtq_s32_f32( bf ); \
1878     out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
1879   }
1880 
       // Narrows four registers of 32-bit integers to bytes (with saturation) and stores
       // 16 bytes to ptr interleaved element by element:
       //   r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], r2[1], r3[1], ...
       // vzip_s16 pairs r0 with r2 and r1 with r3; vst2_u8 then alternates the two
       // resulting byte vectors, which produces the order above.
1881   #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
1882   { \
1883     int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
1884     int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
1885     uint8x8x2_t out = \
1886     { { \
1887       vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
1888       vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
1889     } }; \
1890     vst2_u8(ptr, out); \
1891   }
1892 
       // Loads 16 consecutive floats from ptr as a transposed 4x4 block using the
       // de-interleaving vld4q_f32 load: oK receives ptr[K], ptr[K+4], ptr[K+8], ptr[K+12].
1893   #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
1894   { \
1895     float32x4x4_t tmp = vld4q_f32(ptr); \
1896     o0 = tmp.val[0]; \
1897     o1 = tmp.val[1]; \
1898     o2 = tmp.val[2]; \
1899     o3 = tmp.val[3]; \
1900   }
1901 
       // Logical right shift of each 32-bit lane by the immediate imm.
1902   #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
1903
       // Declaring and reading four-lane constants.
       // First branch: the constant is a plain four-element array and STBIR__CONSTF /
       // STBIR__CONSTI read it back through a pointer cast to the vector type.
       // NOTE(review): the array is declared align(8) but is read as a 16-byte vector -
       // confirm that this alignment is intended.
       // Second branch: the constant is a vector variable and the accessors are identity.
1904   #if defined( _MSC_VER ) && !defined(__clang__)
1905     #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
1906     #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
1907     #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
1908     #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
1909   #else
1910     #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
1911     #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
1912     #define STBIR__CONSTF(var) (var)
1913     #define STBIR__CONSTI(var) (var)
1914   #endif
1915 
1916   #ifdef STBIR_FLOORF
1917   #undef STBIR_FLOORF
1918   #endif
1919   #define STBIR_FLOORF stbir_simd_floorf
1920   static stbir__inline float stbir_simd_floorf(float x)
1921   {
1922     #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1923     return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
1924     #else
1925     float32x2_t f = vdup_n_f32(x);
1926     float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
1927     uint32x2_t a = vclt_f32(f, t);
1928     uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
1929     float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
1930     return vget_lane_f32(r, 0);
1931     #endif
1932   }
1933 
1934   #ifdef STBIR_CEILF
1935   #undef STBIR_CEILF
1936   #endif
1937   #define STBIR_CEILF stbir_simd_ceilf
1938   static stbir__inline float stbir_simd_ceilf(float x)
1939   {
1940     #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1941     return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
1942     #else
1943     float32x2_t f = vdup_n_f32(x);
1944     float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
1945     uint32x2_t a = vclt_f32(t, f);
1946     uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
1947     float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
1948     return vget_lane_f32(r, 0);
1949     #endif
1950   }
1951 
1952   #define STBIR_SIMD
1953 
1954 #elif defined(STBIR_WASM)
1955 
1956   #include <wasm_simd128.h>
1957 
       // WebAssembly SIMD has a single 128-bit vector type, so the float and integer
       // register types are the same type and the casts between them are no-ops.
1958   #define stbir__simdf v128_t
1959   #define stbir__simdi v128_t
1960
1961   #define stbir_simdi_castf( reg ) (reg)
1962   #define stbir_simdf_casti( reg ) (reg)
1963
       // Loads. load1 / load2 splat the loaded 32 / 64 bits across the register, while
       // the "z" variants zero the remaining lanes. load2hmerge replaces the upper
       // 64 bits of reg with 64 bits read from ptr.
1964   #define stbir__simdf_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
1965   #define stbir__simdi_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
1966   #define stbir__simdf_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1967   #define stbir__simdi_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) )
1968   #define stbir__simdf_load1z( out, ptr )           (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
1969   #define stbir__simdf_frep4( fvar )                wasm_f32x4_splat( fvar )
1970   #define stbir__simdf_load1frep4( out, fvar )      (out) = wasm_f32x4_splat( fvar )
1971   #define stbir__simdf_load2( out, ptr )            (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1972   #define stbir__simdf_load2z( out, ptr )           (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
1973   #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
1974
1975   #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
1976   #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
1977
       // Stores. store1 / store2 write the lowest 32 / 64 bits; store2h writes the
       // upper 64 bits.
1978   #define stbir__simdf_store( ptr, reg )   wasm_v128_store( (void*)(ptr), reg )
1979   #define stbir__simdf_store1( ptr, reg )  wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
1980   #define stbir__simdf_store2( ptr, reg )  wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
1981   #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
1982
1983   #define stbir__simdi_store( ptr, reg )  wasm_v128_store( (void*)(ptr), reg )
1984   #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
1985   #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
1986
       // Prefetch expands to nothing on this target.
1987   #define stbir__prefetch( ptr )
1988 
       // Zero-extends the sixteen bytes of ireg to 32 bits in two widening steps
       // (u8 -> u16 -> u32): out0 gets bytes 0-3, out1 bytes 4-7, out2 bytes 8-11 and
       // out3 bytes 12-15.
1989   #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1990   { \
1991     v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
1992     v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
1993     out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
1994     out1 = wasm_u32x4_extend_high_u16x8( l ); \
1995     out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
1996     out3 = wasm_u32x4_extend_high_u16x8( h ); \
1997   }
1998
       // Zero-extends only the lowest four bytes of ireg to four 32-bit lanes.
1999   #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
2000   { \
2001     v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
2002     out = wasm_u32x4_extend_low_u16x8(tmp); \
2003   }
2004
       // Zero-extends the eight 16-bit lanes of ireg: lanes 0-3 to out0, 4-7 to out1.
2005   #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
2006   { \
2007     out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
2008     out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
2009   }
2010 
       // Float <-> integer conversions. The float-to-int forms use the saturating,
       // truncating conversion. The _to_uint8 / _to_short forms first clamp lane 0 to
       // [0, max] and return it as a scalar.
2011   #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
2012   #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
2013   #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0)
2014   #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
2015   #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
2016   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
       // Arithmetic. The _mem forms load a full vector from ptr; the 1_mem forms load
       // one float from ptr and splat it to all lanes before operating.
2017   #define stbir__simdf_add( out, reg0, reg1 )          (out) = wasm_f32x4_add( reg0, reg1 )
2018   #define stbir__simdf_mult( out, reg0, reg1 )         (out) = wasm_f32x4_mul( reg0, reg1 )
2019   #define stbir__simdf_mult_mem( out, reg, ptr )       (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
2020   #define stbir__simdf_mult1_mem( out, reg, ptr )      (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2021   #define stbir__simdf_add_mem( out, reg, ptr )        (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
2022   #define stbir__simdf_add1_mem( out, reg, ptr )       (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2023
       // Multiply-add is written as a separate multiply followed by an add.
2024   #define stbir__simdf_madd( out, add, mul1, mul2 )    (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2025   #define stbir__simdf_madd1( out, add, mul1, mul2 )   (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2026   #define stbir__simdf_madd_mem( out, add, mul, ptr )  (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
2027   #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
2028
2029   #define stbir__simdf_add1( out, reg0, reg1 )  (out) = wasm_f32x4_add( reg0, reg1 )
2030   #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
2031 
       // Bitwise and lane-wise min/max operations. The "1" variants expand to the
       // same four-lane operation as the full-width versions.
2032   #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
2033   #define stbir__simdf_or( out, reg0, reg1 )  (out) = wasm_v128_or( reg0, reg1 )
2034
2035   #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2036   #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2037   #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2038   #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2039
       // Two-register shuffles: indices 0-3 pick lanes of the first operand, 4-7 lanes
       // of the second. The -1 index fills the lane the macro names call "x".
2040   #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
2041   #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
2042
       // aaa1: { alp[3], alp[3], alp[3], ones[0] }   1aaa: { ones[0], alp[0], alp[0], alp[0] }
       // a1a1: { alp[1], ones[0], alp[3], ones[0] }  1a1a: { ones[0], alp[0], ones[0], alp[2] }
2043   #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
2044   #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
2045   #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
2046   #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
2047
       // Single-register lane permute; one..four are lane indices into reg.
2048   #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
2049
2050   #define stbir__simdi_and( out, reg0, reg1 )    (out) = wasm_v128_and( reg0, reg1 )
2051   #define stbir__simdi_or( out, reg0, reg1 )     (out) = wasm_v128_or( reg0, reg1 )
       // Signed 16-bit multiply with adjacent products summed into four 32-bit lanes.
2052   #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
2053 
       // Clamps the eight floats in aa and bb to [0, STBIR_max_uint8_as_float], converts
       // them to integers and narrows (32 -> 16 -> 8 bits) to eight bytes. The second
       // narrow takes out16 for both operands, so the eight bytes appear in both halves
       // of out.
2054   #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
2055   { \
2056     v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2057     v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2058     v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2059     v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2060     v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
2061     out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
2062   }
2063 
       // Clamps the eight floats in aa and bb to [0, STBIR_max_uint16_as_float], converts
       // them to integers and narrows with unsigned saturation to eight 16-bit values
       // (aa's four first, then bb's).
2064   #define stbir__simdf_pack_to_8words(out,aa,bb) \
2065   { \
2066     v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2067     v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2068     v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2069     v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2070     out = wasm_u16x8_narrow_i32x4( ai, bi ); \
2071   }
2072 
       // Narrows four registers of 32-bit integers to bytes (with saturation) and stores
       // 16 bytes to ptr interleaved element by element:
       //   r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], r2[1], r3[1], ...
       // After the two narrowing steps the bytes are grouped per register (r0's four,
       // then r1's, r2's, r3's); the byte shuffle performs the 4x4 transpose.
2073   #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
2074   { \
2075     v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
2076     v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
2077     v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
2078     tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
2079     wasm_v128_store( (void*)(ptr), tmp); \
2080   }
2081 
       // Loads 16 consecutive floats from ptr as a transposed 4x4 block:
       // oK receives ptr[K], ptr[K+4], ptr[K+8], ptr[K+12].
       // The first round of shuffles interleaves row pairs (t0 with t1, t2 with t3);
       // the second round gathers matching 64-bit halves into the output columns.
2082   #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
2083   { \
2084     v128_t t0 = wasm_v128_load( ptr    ); \
2085     v128_t t1 = wasm_v128_load( ptr+4  ); \
2086     v128_t t2 = wasm_v128_load( ptr+8  ); \
2087     v128_t t3 = wasm_v128_load( ptr+12 ); \
2088     v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
2089     v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
2090     v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
2091     v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
2092     o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
2093     o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
2094     o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
2095     o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
2096   }
2097 
       // Logical right shift of each 32-bit lane by imm.
2098   #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
2099
       // Four-lane constants. Float constants are first written as a float vector
       // literal (stbir__f32x4) and then cast to v128_t; integer constants initialize
       // the v128_t directly. The accessors are identity on this target.
2100   typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
2101   #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
2102   #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
2103   #define STBIR__CONSTF(var) (var)
2104   #define STBIR__CONSTI(var) (var)
2105 
2106   #ifdef STBIR_FLOORF
2107   #undef STBIR_FLOORF
2108   #endif
2109   #define STBIR_FLOORF stbir_simd_floorf
2110   static stbir__inline float stbir_simd_floorf(float x)
2111   {
2112     return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
2113   }
2114 
2115   #ifdef STBIR_CEILF
2116   #undef STBIR_CEILF
2117   #endif
2118   #define STBIR_CEILF stbir_simd_ceilf
2119   static stbir__inline float stbir_simd_ceilf(float x)
2120   {
2121     return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
2122   }
2123 
2124   #define STBIR_SIMD
2125 
2126 #endif  // SSE2/NEON/WASM
2127 
2128 #endif // NO SIMD
2129 
     // Width-generic "X" names. Code written against stbir__simdfX_* / STBIR_*X maps to
     // the 8-float operations and constants when STBIR_SIMD8 is defined, and to the
     // 4-float ones otherwise; stbir__simdfX_float_count gives the lane count (8 or 4).
2130 #ifdef STBIR_SIMD8
2131   #define stbir__simdfX stbir__simdf8
2132   #define stbir__simdiX stbir__simdi8
2133   #define stbir__simdfX_load stbir__simdf8_load
2134   #define stbir__simdiX_load stbir__simdi8_load
2135   #define stbir__simdfX_mult stbir__simdf8_mult
2136   #define stbir__simdfX_add_mem stbir__simdf8_add_mem
2137   #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
2138   #define stbir__simdfX_store stbir__simdf8_store
2139   #define stbir__simdiX_store stbir__simdi8_store
2140   #define stbir__simdf_frepX  stbir__simdf8_frep8
2141   #define stbir__simdfX_madd stbir__simdf8_madd
2142   #define stbir__simdfX_min stbir__simdf8_min
2143   #define stbir__simdfX_max stbir__simdf8_max
2144   #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
2145   #define stbir__simdfX_1aaa stbir__simdf8_1aaa
2146   #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
2147   #define stbir__simdfX_1a1a stbir__simdf8_1a1a
2148   #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
2149   #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
2150   #define stbir__simdfX_zero stbir__simdf8_zero
2151   #define STBIR_onesX STBIR_ones8
2152   #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
2153   #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
2154   #define STBIR_simd_point5X STBIR_simd_point58
2155   #define stbir__simdfX_float_count 8
2156   #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
2157   #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
       // Eight-lane broadcast constants used by the 8-wide code paths.
2158   static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
2159   static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
2160   static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
2161   static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
2162   static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
2163   static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
2164 #else
       // 4-wide mapping. The constants go through STBIR__CONSTF because some targets
       // keep them in a form that needs that accessor. In this branch
       // stbir__if_simdf8_cast_to_simdf4 is the identity.
2165   #define stbir__simdfX stbir__simdf
2166   #define stbir__simdiX stbir__simdi
2167   #define stbir__simdfX_load stbir__simdf_load
2168   #define stbir__simdiX_load stbir__simdi_load
2169   #define stbir__simdfX_mult stbir__simdf_mult
2170   #define stbir__simdfX_add_mem stbir__simdf_add_mem
2171   #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
2172   #define stbir__simdfX_store stbir__simdf_store
2173   #define stbir__simdiX_store stbir__simdi_store
2174   #define stbir__simdf_frepX  stbir__simdf_frep4
2175   #define stbir__simdfX_madd stbir__simdf_madd
2176   #define stbir__simdfX_min stbir__simdf_min
2177   #define stbir__simdfX_max stbir__simdf_max
2178   #define stbir__simdfX_aaa1 stbir__simdf_aaa1
2179   #define stbir__simdfX_1aaa stbir__simdf_1aaa
2180   #define stbir__simdfX_a1a1 stbir__simdf_a1a1
2181   #define stbir__simdfX_1a1a stbir__simdf_1a1a
2182   #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
2183   #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
2184   #define stbir__simdfX_zero stbir__simdf_zero
2185   #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
2186   #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
2187   #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
2188   #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
2189   #define stbir__simdfX_float_count 4
2190   #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
2191   #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
2192   #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
2193 #endif
2194 
2195 
     // Storage type for one half-precision value.
     // With NEON on anything other than 32-bit ARM it is a scalar 16-bit type;
     // everywhere else it is a union exposing the raw bits as .u, which is what the
     // software and x86 conversion routines below operate on.
2196 #if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
2197
2198   #if defined( _MSC_VER ) && !defined(__clang__)
2199   typedef __int16 stbir__FP16;
2200   #else
2201   typedef float16_t stbir__FP16;
2202   #endif
2203
2204 #else // no NEON, or 32-bit ARM (_M_ARM or __arm__)
2205
2206   typedef union stbir__FP16
2207   {
2208     unsigned short u; // raw IEEE half-precision bit pattern
2209   } stbir__FP16;
2210
2211 #endif
2212 
2213 #if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
2214 
2215   // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
2216 
2217   static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2218   {
2219     static const stbir__FP32 magic = { (254 - 15) << 23 };
2220     static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
2221     stbir__FP32 o;
2222 
2223     o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
2224     o.f *= magic.f;                 // exponent adjust
2225     if (o.f >= was_infnan.f)        // make sure Inf/NaN survive
2226       o.u |= 255 << 23;
2227     o.u |= (h.u & 0x8000) << 16;    // sign bit
2228     return o.f;
2229   }
2230 
2231   static stbir__inline stbir__FP16 stbir__float_to_half(float val)
2232   {
2233     stbir__FP32 f32infty = { 255 << 23 };
2234     stbir__FP32 f16max   = { (127 + 16) << 23 };
2235     stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
2236     unsigned int sign_mask = 0x80000000u;
2237     stbir__FP16 o = { 0 };
2238     stbir__FP32 f;
2239     unsigned int sign;
2240 
2241     f.f = val;
2242     sign = f.u & sign_mask;
2243     f.u ^= sign;
2244 
2245     if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2246       o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2247     else // (De)normalized number or zero
2248     {
2249       if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
2250       {
2251         // use a magic value to align our 10 mantissa bits at the bottom of
2252         // the float. as long as FP addition is round-to-nearest-even this
2253         // just works.
2254         f.f += denorm_magic.f;
2255         // and one integer subtract of the bias later, we have our final float!
2256         o.u = (unsigned short) ( f.u - denorm_magic.u );
2257       }
2258       else
2259       {
2260         unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
2261         // update exponent, rounding bias part 1
2262         f.u = f.u + ((15u - 127) << 23) + 0xfff;
2263         // rounding bias part 2
2264         f.u += mant_odd;
2265         // take the bits!
2266         o.u = (unsigned short) ( f.u >> 13 );
2267       }
2268     }
2269 
2270     o.u |= sign >> 16;
2271     return o;
2272   }
2273 
2274 #endif
2275 
2276 
2277 #if defined(STBIR_FP16C)
2278 
2279   #include <immintrin.h>
2280 
2281   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2282   {
2283     _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
2284   }
2285 
2286   static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2287   {
2288     _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
2289   }
2290 
2291   static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2292   {
2293     return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
2294   }
2295 
2296   static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2297   {
2298     stbir__FP16 h;
2299     h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
2300     return h;
2301   }
2302 
2303 #elif defined(STBIR_SSE2)
2304 
2305   // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
2306   stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
2307   {
2308     static const STBIR__SIMDI_CONST(mask_nosign,      0x7fff);
2309     static const STBIR__SIMDI_CONST(smallest_normal,  0x0400);
2310     static const STBIR__SIMDI_CONST(infinity,         0x7c00);
2311     static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
2312     static const STBIR__SIMDI_CONST(magic_denorm,     113 << 23);
2313 
2314     __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
2315     __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
2316     __m128i mnosign     = STBIR__CONSTI(mask_nosign);
2317     __m128i eadjust     = STBIR__CONSTI(expadjust_normal);
2318     __m128i smallest    = STBIR__CONSTI(smallest_normal);
2319     __m128i infty       = STBIR__CONSTI(infinity);
2320     __m128i expmant     = _mm_and_si128(mnosign, h);
2321     __m128i justsign    = _mm_xor_si128(h, expmant);
2322     __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2323     __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
2324     __m128i shifted     = _mm_slli_epi32(expmant, 13);
2325     __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
2326     __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
2327     __m128i den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2328     __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
2329     __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2330     __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2331     __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2332     __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
2333     __m128i sign        = _mm_slli_epi32(justsign, 16);
2334     __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2335     stbir__simdf_store( output + 0,  final );
2336 
2337     h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
2338     expmant     = _mm_and_si128(mnosign, h);
2339     justsign    = _mm_xor_si128(h, expmant);
2340     b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2341     b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
2342     shifted     = _mm_slli_epi32(expmant, 13);
2343     adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
2344     adjusted    = _mm_add_epi32(eadjust, shifted);
2345     den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2346     adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
2347     den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2348     adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2349     adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2350     adjusted5   = _mm_or_ps(adjusted3, adjusted4);
2351     sign        = _mm_slli_epi32(justsign, 16);
2352     final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2353     stbir__simdf_store( output + 4,  final );
2354 
2355     // ~38 SSE2 ops for 8 values
2356   }
2357 
2358   // Fabian's round-to-nearest-even float to half
2359   // ~48 SSE2 ops for 8 output
2360   stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
2361   {
2362     static const STBIR__SIMDI_CONST(mask_sign,      0x80000000u);
2363     static const STBIR__SIMDI_CONST(c_f16max,       (127 + 16) << 23); // all FP32 values >=this round to +inf
2364     static const STBIR__SIMDI_CONST(c_nanbit,        0x200);
2365     static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
2366     static const STBIR__SIMDI_CONST(c_min_normal,    (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
2367     static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
2368     static const STBIR__SIMDI_CONST(c_normal_bias,    0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
2369 
2370     __m128  f           =  _mm_loadu_ps(input);
2371     __m128  msign       = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
2372     __m128  justsign    = _mm_and_ps(msign, f);
2373     __m128  absf        = _mm_xor_ps(f, justsign);
2374     __m128i absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2375     __m128i f16max      = STBIR__CONSTI(c_f16max);
2376     __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2377     __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2378     __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
2379     __m128i inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2380 
2381     __m128i min_normal  = STBIR__CONSTI(c_min_normal);
2382     __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
2383 
2384     // "result is subnormal" path
2385     __m128  subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2386     __m128i subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2387 
2388     // "result is normal" path
2389     __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2390     __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2391 
2392     __m128i round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2393     __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2394     __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result
2395 
2396     // combine the two non-specials
2397     __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2398 
2399     // merge in specials as well
2400     __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2401 
2402     __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2403     __m128i final2, final= _mm_or_si128(joined, sign_shift);
2404 
2405     f           =  _mm_loadu_ps(input+4);
2406     justsign    = _mm_and_ps(msign, f);
2407     absf        = _mm_xor_ps(f, justsign);
2408     absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2409     b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2410     b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2411     nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
2412     inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2413 
2414     b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
2415 
2416     // "result is subnormal" path
2417     subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2418     subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2419 
2420     // "result is normal" path
2421     mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2422     mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2423 
2424     round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2425     round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2426     normal      = _mm_srli_epi32(round2, 13); // rounded result
2427 
2428     // combine the two non-specials
2429     nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2430 
2431     // merge in specials as well
2432     joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2433 
2434     sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2435     final2      = _mm_or_si128(joined, sign_shift);
2436     final       = _mm_packs_epi32(final, final2);
2437     stbir__simdi_store( output,final );
2438   }
2439 
2440 #elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
2441 
2442   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2443   {
2444     float16x4_t in0 = vld1_f16(input + 0);
2445     float16x4_t in1 = vld1_f16(input + 4);
2446     vst1q_f32(output + 0, vcvt_f32_f16(in0));
2447     vst1q_f32(output + 4, vcvt_f32_f16(in1));
2448   }
2449 
2450   static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2451   {
2452     float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2453     float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2454     vst1_f16(output+0, out0);
2455     vst1_f16(output+4, out1);
2456   }
2457 
2458   static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2459   {
2460     return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
2461   }
2462 
2463   static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2464   {
2465     return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
2466   }
2467 
2468 #elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
2469 
2470   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2471   {
2472     float16x8_t in = vld1q_f16(input);
2473     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
2474     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
2475   }
2476 
2477   static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2478   {
2479     float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2480     float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2481     vst1q_f16(output, vcombine_f16(out0, out1));
2482   }
2483 
2484   static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2485   {
2486     return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
2487   }
2488 
2489   static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2490   {
2491     return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
2492   }
2493 
2494 #elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
2495 
2496   static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2497   {
2498     for (int i=0; i<8; i++)
2499     {
2500       output[i] = stbir__half_to_float(input[i]);
2501     }
2502   }
2503   static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2504   {
2505     for (int i=0; i<8; i++)
2506     {
2507       output[i] = stbir__float_to_half(input[i]);
2508     }
2509   }
2510 
2511 #endif
2512 
2513 
2514 #ifdef STBIR_SIMD
2515 
// Named lane shuffles.  Each stbir__simdf_0123toABCD( out, reg ) assigns to out the result
//   of stbir__simdf_swiz( reg, A,B,C,D ); the four digits after "to" are exactly the lane
//   indices handed to the swizzle.
#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
2536 
// Overlay of one stbir__simdi with four 32-bit elements, viewable as unsigned or signed.
//   NOTE(review): the member names look chosen to mirror MSVC's __m128i fields - confirm.
typedef union stbir__simdi_u32
{
  stbir_uint32 m128i_u32[4];  // the four elements as unsigned 32-bit values
  int m128i_i32[4];           // the same four elements as signed ints
  stbir__simdi m128i_i128;    // the whole integer SIMD value
} stbir__simdi_u32;
2543 
// three zeros, three all-ones (-1) entries, three zeros
//   NOTE(review): presumably read at a sliding offset to build per-lane masks - confirm at the use sites
static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };

// SIMD float constants built from the scalar 8/16-bit range constants of the same name
static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,           stbir__max_uint8_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,          stbir__max_uint16_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,  stbir__max_uint8_as_float_inverted);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);

static const STBIR__SIMDF_CONST(STBIR_simd_point5,   0.5f);
static const STBIR__SIMDF_CONST(STBIR_ones,          1.0f);
// integer constants; read as IEEE-754 single bit patterns, (127-13)<<23 is 2^-13 and
//   0x3f7fffff is the largest value below 1.0f
static const STBIR__SIMDI_CONST(STBIR_almost_zero,   (127 - 13) << 23);
static const STBIR__SIMDI_CONST(STBIR_almost_one,    0x3f7fffff);
static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
2557 
//   Basically, in simd mode, we unroll the proper amount ourselves, and we don't want
//   the non-simd remnant loops to be unrolled because they only run a few times.
//   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
//   In simd builds the STBIR_SIMD_* forms simply forward to the plain STBIR_* forms.
#define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR

// in simd builds, any previous STBIR_MEMCPY definition is replaced by stbir_simd_memcpy
#ifdef STBIR_MEMCPY
#undef STBIR_MEMCPY
#endif
#define STBIR_MEMCPY stbir_simd_memcpy
2570 
2571 // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
2572 static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
2573 {
2574   char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
2575   char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
2576   ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
2577 
2578   // check overlaps
2579   STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
2580 
2581   if ( bytes < (16*stbir__simdfX_float_count) )
2582   {
2583     if ( bytes < 16 )
2584     {
2585       if ( bytes )
2586       {
2587         STBIR_SIMD_NO_UNROLL_LOOP_START
2588         do
2589         {
2590           STBIR_SIMD_NO_UNROLL(d);
2591           d[ 0 ] = d[ ofs_to_src ];
2592           ++d;
2593         } while ( d < d_end );
2594       }
2595     }
2596     else
2597     {
2598       stbir__simdf x;
2599       // do one unaligned to get us aligned for the stream out below
2600       stbir__simdf_load( x, ( d + ofs_to_src ) );
2601       stbir__simdf_store( d, x );
2602       d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
2603 
2604       STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2605       for(;;)
2606       {
2607         STBIR_SIMD_NO_UNROLL(d);
2608 
2609         if ( d > ( d_end - 16 ) )
2610         {
2611           if ( d == d_end )
2612             return;
2613           d = d_end - 16;
2614         }
2615 
2616         stbir__simdf_load( x, ( d + ofs_to_src ) );
2617         stbir__simdf_store( d, x );
2618         d += 16;
2619       }
2620     }
2621   }
2622   else
2623   {
2624     stbir__simdfX x0,x1,x2,x3;
2625 
2626     // do one unaligned to get us aligned for the stream out below
2627     stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
2628     stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
2629     stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
2630     stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
2631     stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
2632     stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
2633     stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
2634     stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
2635     d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
2636 
2637     STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2638     for(;;)
2639     {
2640       STBIR_SIMD_NO_UNROLL(d);
2641 
2642       if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
2643       {
2644         if ( d == d_end )
2645           return;
2646         d = d_end - (16*stbir__simdfX_float_count);
2647       }
2648 
2649       stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
2650       stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
2651       stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
2652       stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
2653       stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
2654       stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
2655       stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
2656       stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
2657       d += (16*stbir__simdfX_float_count);
2658     }
2659   }
2660 }
2661 
2662 // memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
2663 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
2664 //   the diff between dest and src)
2665 static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
2666 {
2667   char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
2668   char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
2669   ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
2670 
2671   if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
2672   {
2673     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
2674     STBIR_SIMD_NO_UNROLL_LOOP_START
2675     do
2676     {
2677       stbir__simdf x;
2678       STBIR_SIMD_NO_UNROLL(sd);
2679       stbir__simdf_load( x, sd );
2680       stbir__simdf_store(  ( sd + ofs_to_dest ), x );
2681       sd += 16;
2682     } while ( sd < s_end16 );
2683 
2684     if ( sd == s_end )
2685       return;
2686   }
2687 
2688   do
2689   {
2690     STBIR_SIMD_NO_UNROLL(sd);
2691     *(int*)( sd + ofs_to_dest ) = *(int*) sd;
2692     sd += 4;
2693   } while ( sd < s_end );
2694 }
2695 
2696 #else // no SSE2
2697 
// when in scalar mode, we let unrolling happen, so the pointer macro just does the __restrict
//   (via STBIR_STREAMOUT_PTR) and the three no-unroll hints expand to nothing
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2703 
2704 #endif // SSE2
2705 
2706 
2707 #ifdef STBIR_PROFILE
2708 
2709 #ifndef STBIR_PROFILE_FUNC
2710 
2711 #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
2712 
2713 #ifdef _MSC_VER
2714 
2715   STBIRDEF stbir_uint64 __rdtsc();
2716   #define STBIR_PROFILE_FUNC() __rdtsc()
2717 
2718 #else // non msvc
2719 
2720   static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
2721   {
2722     stbir_uint32 lo, hi;
2723     asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
2724     return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
2725   }
2726 
2727 #endif  // msvc
2728 
2729 #elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)
2730 
2731 #if defined( _MSC_VER ) && !defined(__clang__)
2732 
2733   #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
2734 
2735 #else
2736 
2737   static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
2738   {
2739     stbir_uint64 tsc;
2740     asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
2741     return tsc;
2742   }
2743 
2744 #endif
2745 
2746 #else // x64, arm
2747 
2748 #error Unknown platform for profiling.
2749 
2750 #endif  // x64, arm
2751 
2752 #endif // STBIR_PROFILE_FUNC
2753 
// extra trailing parameter (GET, for parameter lists) / argument (SET, for call sites) that
//   only exists in profile builds; note the leading comma is part of the macro
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info

// super light-weight micro profiler
//   START opens a brace scope, records the current STBIR_PROFILE_FUNC() value, and points
//     info->current_zone_excluded_ptr at a fresh local accumulator (saving the parent's pointer).
//   END computes the elapsed time, adds (elapsed - time accumulated by nested zones) to
//     info->profile.named.<wh>, adds the full elapsed time to the parent's accumulator,
//     restores the parent's pointer and closes the scope.  START and END must be paired
//     with the same wh inside one function body.
//   FIRST_START additionally points the accumulator at profile.named.total and zeroes
//     every entry of profile.array before starting the zone.
//   CLEAR_EXTRAS zeroes profile.array for info[1] .. info[num-1].
#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }

// for thread data (these expect a variable named split_info, and split_count for CLEAR_EXTRAS, in scope)
#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )

// for build data (these expect a variable named profile_info in scope)
#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }

#else  // no profile

// without STBIR_PROFILE every profiling macro expands to nothing
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO

#define STBIR_PROFILE_START( wh )
#define STBIR_PROFILE_END( wh )
#define STBIR_PROFILE_FIRST_START( wh )
#define STBIR_PROFILE_CLEAR_EXTRAS( )

#define STBIR_PROFILE_BUILD_START( wh )
#define STBIR_PROFILE_BUILD_END( wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh )
#define STBIR_PROFILE_BUILD_CLEAR( info )
2795 
2796 #endif  // stbir_profile
2797 
// Float ceil/floor used by the resizer; define STBIR_CEILF before this point to override.
#ifndef STBIR_CEILF
#include <math.h>
// Only VC6-era MSVC (_MSC_VER <= 1200) takes the double-precision fallback.  The defined()
// check is required: inside #if an undefined _MSC_VER evaluates to 0, which would
// otherwise send every non-MSVC compiler down the fallback path as well.
#if defined(_MSC_VER) && _MSC_VER <= 1200 // support VC6 for Sean
#define STBIR_CEILF(x) ((float)ceil((float)(x)))
#define STBIR_FLOORF(x) ((float)floor((float)(x)))
#else
#define STBIR_CEILF(x) ceilf(x)
#define STBIR_FLOORF(x) floorf(x)
#endif
#endif
2808 
2809 #ifndef STBIR_MEMCPY
2810 // For memcpy
2811 #include <string.h>
2812 #define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
2813 #endif
2814 
2815 #ifndef STBIR_SIMD
2816 
2817 // memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be
2818 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
2819 //   the diff between dest and src)
2820 static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
2821 {
2822   char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
2823   char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
2824   ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
2825 
2826   if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
2827   {
2828     char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
2829     STBIR_NO_UNROLL_LOOP_START
2830     do
2831     {
2832       STBIR_NO_UNROLL(sd);
2833       *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
2834       sd += 8;
2835     } while ( sd < s_end8 );
2836 
2837     if ( sd == s_end )
2838       return;
2839   }
2840 
2841   STBIR_NO_UNROLL_LOOP_START
2842   do
2843   {
2844     STBIR_NO_UNROLL(sd);
2845     *(int*)( sd + ofs_to_dest ) = *(int*) sd;
2846     sd += 4;
2847   } while ( sd < s_end );
2848 }
2849 
2850 #endif
2851 
// Trapezoid kernel: 1 while |x| <= 0.5 - scale/2, 0 once |x| >= 0.5 + scale/2, and a
//   linear ramp (t - |x|) / scale in between.  scale must be <= 1 (asserted).
static float stbir__filter_trapezoid(float x, float scale, void * user_data)
{
  float half = scale / 2;
  float outer = 0.5f + half;  // kernel reaches zero here
  float inner;                // kernel is flat (1.0) up to here
  STBIR_ASSERT(scale <= 1);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x >= outer)
    return 0.0f;

  inner = 0.5f - half;
  if (x <= inner)
    return 1.0f;

  return (outer - x) / scale;
}
2872 
// Support radius of the trapezoid kernel: 0.5 + scale/2.
static float stbir__support_trapezoid(float scale, void * user_data)
{
  STBIR__UNUSED(user_data);
  return scale / 2.0f + 0.5f;
}
2878 
// Triangle (tent) kernel: 1 - |x| for |x| <= 1, otherwise 0.  s and user_data are unused.
static float stbir__filter_triangle(float x, float s, void * user_data)
{
  float ax = ( x < 0.0f ) ? -x : x;
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  return (ax <= 1.0f) ? (1.0f - ax) : 0.0f;
}
2891 
// Point-sample kernel: weight 1 for every input; all three arguments are ignored.
static float stbir__filter_point(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  STBIR__UNUSED(x);

  return 1.0f;
}
2900 
// Piecewise cubic kernel on |x|: one polynomial for |x| < 1, another for 1 <= |x| < 2,
//   and 0 otherwise.  s and user_data are unused.
static float stbir__filter_cubic(float x, float s, void * user_data)
{
  float ax = ( x < 0.0f ) ? -x : x;
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if (ax < 1.0f)
    return (4.0f + ax*ax*(3.0f*ax - 6.0f))/6.0f;

  if (ax < 2.0f)
    return (8.0f + ax*(-12.0f + ax*(6.0f - ax)))/6.0f;

  return 0.0f;
}
2915 
// Catmull-Rom kernel on |x|: one polynomial for |x| < 1, another for 1 <= |x| < 2,
//   and 0 otherwise.  s and user_data are unused.
static float stbir__filter_catmullrom(float x, float s, void * user_data)
{
  float ax = ( x < 0.0f ) ? -x : x;
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if (ax < 1.0f)
    return 1.0f - ax*ax*(2.5f - 1.5f*ax);

  if (ax < 2.0f)
    return 2.0f - ax*(4.0f + ax*(0.5f*ax - 2.5f));

  return 0.0f;
}
2930 
// Mitchell kernel on |x|: one polynomial for |x| < 1, another for 1 <= |x| < 2,
//   and 0 otherwise.  s and user_data are unused.
static float stbir__filter_mitchell(float x, float s, void * user_data)
{
  float ax = ( x < 0.0f ) ? -x : x;
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if (ax < 1.0f)
    return (16.0f + ax*ax*(21.0f * ax - 36.0f))/18.0f;

  if (ax < 2.0f)
    return (32.0f + ax*(-60.0f + ax*(36.0f - 7.0f*ax)))/18.0f;

  return 0.0f;
}
2945 
// Support callback that always reports a radius of 0.5.
static float stbir__support_zeropoint5(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 0.5f;
}
2952 
// Support callback that always reports a radius of 1.
static float stbir__support_one(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 1.0f;
}
2959 
// Support callback that always reports a radius of 2.
static float stbir__support_two(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 2.0f;
}
2966 
2967 // This is the maximum number of input samples that can affect an output sample
2968 // with the given filter from the output pixel's perspective
2969 static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
2970 {
2971   STBIR_ASSERT(support != 0);
2972 
2973   if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
2974     return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
2975   else
2976     return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
2977 }
2978 
2979 // this is how many coefficents per run of the filter (which is different
2980 //   from the filter_pixel_width depending on if we are scattering or gathering)
2981 static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
2982 {
2983   float scale = samp->scale_info.scale;
2984   stbir__support_callback * support = samp->filter_support;
2985 
2986   switch( is_gather )
2987   {
2988     case 1:
2989       return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
2990     case 2:
2991       return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
2992     case 0:
2993       return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
2994     default:
2995       STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
2996       return 0;
2997   }
2998 }
2999 
3000 static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
3001 {
3002   if (is_gather)
3003       return samp->scale_info.output_sub_size;
3004   else
3005       return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
3006 }
3007 
// Edge handler slot for STBIR_EDGE_ZERO: ignores both arguments and returns 0.
static int stbir__edge_zero_full( int n, int max )
{
  STBIR__UNUSED(max);
  STBIR__UNUSED(n);
  return 0; // NOTREACHED
}
3014 
// Clamp edge mode: indices below 0 map to 0, indices at or past max map to max - 1,
//   anything else is returned unchanged.
static int stbir__edge_clamp_full( int n, int max )
{
  return ( n < 0 ) ? 0 : ( ( n >= max ) ? ( max - 1 ) : n );
}
3025 
// Reflect edge mode.  Negative indices mirror to -n (max - 1 once n <= -max); indices at
//   or past max mirror to 2*max - n - 1 (0 once n >= 2*max); in-range indices pass through.
static int stbir__edge_reflect_full( int n, int max )
{
  int twice;

  if ( ( n >= 0 ) && ( n < max ) )
    return n;

  if ( n < 0 )
    return ( n > -max ) ? -n : ( max - 1 );

  // here n >= max
  twice = max * 2;
  return ( n >= twice ) ? 0 : ( twice - n - 1 );
}
3047 
// Wrap edge mode: maps any index into [0, max) by wrapping around, with negative indices
//   counted back from the end.
static int stbir__edge_wrap_full( int n, int max )
{
  int rem;

  if ( n >= 0 )
    return n % max;

  rem = ( -n ) % max;
  return rem ? ( max - rem ) : 0;
}
3062 
// Signature shared by the out-of-range edge handlers: given pixel index n and size max,
//   return the index to read instead.
typedef int stbir__edge_wrap_func( int n, int max );
// Handler table indexed by the stbir_edge value (see stbir__edge_wrap); the entry order
//   has to match the enum values named in the trailing comments.
static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
{
  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
};
3071 
3072 stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
3073 {
3074   // avoid per-pixel switch
3075   if (n >= 0 && n < max)
3076       return n;
3077   return stbir__edge_wrap_slow[edge]( n, max );
3078 }
3079 
// an edge-mapped run of input pixels is merged into the main run when the gap between
//   them is at most this many pixels (see the merge tests in stbir__get_extents)
#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16

// get information on the extents of a sampler
//   Scans the contributors of a gather sampler for the lowest n0 and highest n1 they
//   reference, then fills scanline_extents with:
//     edge_sizes[0], edge_sizes[1] - how many pixels are read past the left / right edge
//     spans[0]                     - the run of input pixels that is read
//     spans[1]                     - an optional second run (n1 < n0 means unused), needed
//                                    when edge pixels map to a region that could not be
//                                    merged into the first run
//   scanline_extents->conservative must already be filled in; it is only asserted against.
static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
{
  int j, stop;
  int left_margin, right_margin;
  int min_n = 0x7fffffff, max_n = -0x7fffffff;
  // 0x7fffffff in min_left / min_right doubles as "no pixels on that side" below
  int min_left = 0x7fffffff, max_left = -0x7fffffff;
  int min_right = 0x7fffffff, max_right = -0x7fffffff;
  stbir_edge edge = samp->edge;
  stbir__contributors* contributors = samp->contributors;
  int output_sub_size = samp->scale_info.output_sub_size;
  int input_full_size = samp->scale_info.input_full_size;
  int filter_pixel_margin = samp->filter_pixel_margin;

  STBIR_ASSERT( samp->is_gather );

  // find the smallest n0, scanning forward from the first contributor
  stop = output_sub_size;
  for (j = 0; j < stop; j++ )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n0 < min_n )
    {
      min_n = contributors[j].n0;
      stop = j + filter_pixel_margin;  // if we find a new min, only scan another filter width
      if ( stop > output_sub_size ) stop = output_sub_size;
    }
  }

  // find the largest n1, scanning backward from the last contributor
  stop = 0;
  for (j = output_sub_size - 1; j >= stop; j-- )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n1 > max_n )
    {
      max_n = contributors[j].n1;
      stop = j - filter_pixel_margin;  // if we find a new max, only scan another filter width
      if (stop<0) stop = 0;
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // now calculate how much into the margins we really read
  //   (min_n / max_n are clamped to the input range, the overhang goes into the margins)
  left_margin = 0;
  if ( min_n < 0 )
  {
    left_margin = -min_n;
    min_n = 0;
  }

  right_margin = 0;
  if ( max_n >= input_full_size )
  {
    right_margin = max_n - input_full_size + 1;
    max_n = input_full_size - 1;
  }

  // edge_sizes holds the margin pixel extents (how many pixels we hang over each edge)
  scanline_extents->edge_sizes[0] = left_margin;
  scanline_extents->edge_sizes[1] = right_margin;

  // spans[0] is the pixels read from the input
  scanline_extents->spans[0].n0 = min_n;
  scanline_extents->spans[0].n1 = max_n;
  scanline_extents->spans[0].pixel_offset_for_input = min_n;

  // default to no other input range
  scanline_extents->spans[1].n0 = 0;
  scanline_extents->spans[1].n1 = -1;
  scanline_extents->spans[1].pixel_offset_for_input = 0;

  // don't have to do edge calc for zero clamp
  if ( edge == STBIR_EDGE_ZERO )
    return;

  // convert margin pixels to the pixels within the input (min and max)
  for( j = -left_margin ; j < 0 ; j++ )
  {
      int p = stbir__edge_wrap( edge, j, input_full_size );
      if ( p < min_left )
        min_left = p;
      if ( p > max_left )
        max_left = p;
  }

  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
  {
      int p = stbir__edge_wrap( edge, j, input_full_size );
      if ( p < min_right )
        min_right = p;
      if ( p > max_right )
        max_right = p;
  }

  // merge the left margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of main pixel region
  if ( min_left != 0x7fffffff )
  {
    if ( ( ( min_left <= min_n ) && ( ( max_left  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_left ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      left_margin = 0;  // merged, so no separate left run is needed below
    }
  }

  // merge the right margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of main pixel region
  if ( min_right != 0x7fffffff )
  {
    if ( ( ( min_right <= min_n ) && ( ( max_right  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_right ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      right_margin = 0;  // merged, so no separate right run is needed below
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // you get two ranges when you have the WRAP edge mode and you are doing just a piece of the resize
  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
  //   wouldn't need except for WRAP)


  // if we can't merge the min_left range, add it as a second range
  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    STBIR_ASSERT( right_margin == 0 );
    // keep the spans ordered by input position: if the new run starts earlier, move the
    //   main run to spans[1] and put the new run in spans[0]
    if ( min_left < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_left;
    newspan->n0 = -left_margin;
    newspan->n1 = ( max_left - min_left ) - left_margin;
    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
    return;
  }

  // if we can't merge the min_right range, add it as a second range
  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    // same ordering rule as for the left margin above
    if ( min_right < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_right;
    // NOTE(review): both lines read spans[1].n1, which holds the main run's end only when
    //   the swap above happened - confirm the non-swapped case cannot occur here
    newspan->n0 = scanline_extents->spans[1].n1 + 1;
    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
    return;
  }
}
3248 
3249 static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
3250 {
3251   int first, last;
3252   float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
3253   float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
3254 
3255   float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
3256   float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;
3257 
3258   first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
3259   last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
3260   if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
3261 
3262   if ( edge == STBIR_EDGE_WRAP )
3263   {
3264     if ( first < -input_size )
3265       first = -input_size;
3266     if ( last >= (input_size*2))
3267       last = (input_size*2) - 1;
3268   }
3269 
3270   *first_pixel = first;
3271   *last_pixel = last;
3272 }
3273 
3274 static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
3275 {
3276   int n, end;
3277   float inv_scale = scale_info->inv_scale;
3278   float out_shift = scale_info->pixel_shift;
3279   int input_size  = scale_info->input_full_size;
3280   int numerator = scale_info->scale_numerator;
3281   int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
3282 
3283   // Looping through out pixels
3284   end = num_contributors; if ( polyphase ) end = numerator;
3285   for (n = 0; n < end; n++)
3286   {
3287     int i;
3288     int last_non_zero;
3289     float out_pixel_center = (float)n + 0.5f;
3290     float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
3291 
3292     int in_first_pixel, in_last_pixel;
3293 
3294     stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
3295 
3296     // make sure we never generate a range larger than our precalculated coeff width
3297     //   this only happens in point sample mode, but it's a good safe thing to do anyway
3298     if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
3299       in_last_pixel = in_first_pixel + coefficient_width - 1;
3300 
3301     last_non_zero = -1;
3302     for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
3303     {
3304       float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
3305       float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
3306 
3307       // kill denormals
3308       if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
3309       {
3310         if ( i == 0 )  // if we're at the front, just eat zero contributors
3311         {
3312           STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
3313           ++in_first_pixel;
3314           i--;
3315           continue;
3316         }
3317         coeff = 0;  // make sure is fully zero (should keep denormals away)
3318       }
3319       else
3320         last_non_zero = i;
3321 
3322       coefficient_group[i] = coeff;
3323     }
3324 
3325     in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
3326     contributors->n0 = in_first_pixel;
3327     contributors->n1 = in_last_pixel;
3328 
3329     STBIR_ASSERT(contributors->n1 >= contributors->n0);
3330 
3331     ++contributors;
3332     coefficient_group += coefficient_width;
3333   }
3334 }
3335 
3336 static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
3337 {
3338   if ( new_pixel <= contribs->n1 )  // before the end
3339   {
3340     if ( new_pixel < contribs->n0 ) // before the front?
3341     {
3342       if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
3343       { 
3344         int j, o = contribs->n0 - new_pixel;
3345         for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
3346           coeffs[ j + o ] = coeffs[ j ];
3347         for ( j = 1 ; j < o ; j-- )
3348           coeffs[ j ] = coeffs[ 0 ];
3349         coeffs[ 0 ] = new_coeff;
3350         contribs->n0 = new_pixel;
3351       }
3352     }
3353     else
3354     {
3355       coeffs[ new_pixel - contribs->n0 ] += new_coeff;
3356     }
3357   }
3358   else
3359   {
3360     if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
3361     {
3362       int j, e = new_pixel - contribs->n0;
3363       for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
3364         coeffs[j] = 0;
3365 
3366       coeffs[ e ] = new_coeff;
3367       contribs->n1 = new_pixel;
3368     }
3369   }
3370 }
3371 
// Projects the footprint of one input pixel (center +/- radius, in input space) into
//   output space, rounds the bounds to output pixel indices and clamps them to
//   [0, out_size-1].  The returned first can be greater than last.
static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
{
  // footprint in input space
  float in_lo = in_pixel_center - in_pixels_radius;
  float in_hi = in_pixel_center + in_pixels_radius;

  // the same footprint expressed in output space
  float out_lo = in_lo * scale - out_shift;
  float out_hi = in_hi * scale - out_shift;

  int lo = (int)( STBIR_FLOORF( out_lo + 0.5f ) );
  int hi = (int)( STBIR_FLOORF( out_hi - 0.5f ) );

  *first_pixel = ( lo < 0 ) ? 0 : lo;
  *last_pixel = ( hi >= out_size ) ? ( out_size - 1 ) : hi;
}
3388 
3389 static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
3390 {
3391   int in_pixel;
3392   int i;
3393   int first_out_inited = -1;
3394   float scale = scale_info->scale;
3395   float out_shift = scale_info->pixel_shift;
3396   int out_size = scale_info->output_sub_size;
3397   int numerator = scale_info->scale_numerator;
3398   int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
3399 
3400   STBIR__UNUSED(num_contributors);
3401 
3402   // Loop through the input pixels
3403   for (in_pixel = start; in_pixel < end; in_pixel++)
3404   {
3405     float in_pixel_center = (float)in_pixel + 0.5f;
3406     float out_center_of_in = in_pixel_center * scale - out_shift;
3407     int out_first_pixel, out_last_pixel;
3408 
3409     stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
3410 
3411     if ( out_first_pixel > out_last_pixel )
3412       continue;
3413 
3414     // clamp or exit if we are using polyphase filtering, and the limit is up
3415     if ( polyphase )
3416     {
3417       // when polyphase, you only have to do coeffs up to the numerator count
3418       if ( out_first_pixel == numerator )
3419         break;
3420 
3421       // don't do any extra work, clamp last pixel at numerator too
3422       if ( out_last_pixel >= numerator )
3423         out_last_pixel = numerator - 1;
3424     }
3425 
3426     for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
3427     {
3428       float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
3429       float x = out_pixel_center - out_center_of_in;
3430       float coeff = kernel(x, scale, user_data) * scale;
3431 
3432       // kill the coeff if it's too small (avoid denormals)
3433       if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
3434         coeff = 0.0f;
3435 
3436       {
3437         int out = i + out_first_pixel;
3438         float * coeffs = coefficient_group + out * coefficient_width;
3439         stbir__contributors * contribs = contributors + out;
3440 
3441         // is this the first time this output pixel has been seen?  Init it.
3442         if ( out > first_out_inited )
3443         {
3444           STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
3445           first_out_inited = out;
3446           contribs->n0 = in_pixel;
3447           contribs->n1 = in_pixel;
3448           coeffs[0]  = coeff;
3449         }
3450         else
3451         {
3452           // insert on end (always in order)
3453           if ( coeffs[0] == 0.0f )  // if the first coefficent is zero, then zap it for this coeffs
3454           {
3455             STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
3456             contribs->n0 = in_pixel;
3457           }
3458           contribs->n1 = in_pixel;
3459           STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
3460           coeffs[in_pixel - contribs->n0]  = coeff;
3461         }
3462       }
3463     }
3464   }
3465 }
3466 
3467 #ifdef STBIR_RENORMALIZE_IN_FLOAT
3468 #define STBIR_RENORM_TYPE float
3469 #else
3470 #define STBIR_RENORM_TYPE double
3471 #endif
3472 
3473 static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
3474 {
3475   int input_size = scale_info->input_full_size;
3476   int input_last_n1 = input_size - 1;
3477   int n, end;
3478   int lowest = 0x7fffffff;
3479   int highest = -0x7fffffff;
3480   int widest = -1;
3481   int numerator = scale_info->scale_numerator;
3482   int denominator = scale_info->scale_denominator;
3483   int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
3484   float * coeffs;
3485   stbir__contributors * contribs;
3486 
3487   // weight all the coeffs for each sample
3488   coeffs = coefficient_group;
3489   contribs = contributors;
3490   end = num_contributors; if ( polyphase ) end = numerator;
3491   for (n = 0; n < end; n++)
3492   {
3493     int i;
3494     STBIR_RENORM_TYPE filter_scale, total_filter = 0;
3495     int e;
3496 
3497     // add all contribs
3498     e = contribs->n1 - contribs->n0;
3499     for( i = 0 ; i <= e ; i++ )
3500     {
3501       total_filter += (STBIR_RENORM_TYPE) coeffs[i];
3502       STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
3503     }
3504 
3505     // rescale
3506     if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
3507     {
3508       // all coeffs are extremely small, just zero it
3509       contribs->n1 = contribs->n0;
3510       coeffs[0] = 0.0f;
3511     }
3512     else
3513     {
3514       // if the total isn't 1.0, rescale everything
3515       if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
3516       {
3517         filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
3518 
3519         // scale them all
3520         for (i = 0; i <= e; i++)
3521           coeffs[i] = (float) ( coeffs[i] * filter_scale );
3522       }
3523     }
3524     ++contribs;
3525     coeffs += coefficient_width;
3526   }
3527 
3528   // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
3529   //   most of the coefficients, so we copy them here
3530   if ( polyphase )
3531   {
3532     stbir__contributors * prev_contribs = contributors;
3533     stbir__contributors * cur_contribs = contributors + numerator;
3534 
3535     for( n = numerator ; n < num_contributors ; n++ )
3536     {
3537       cur_contribs->n0 = prev_contribs->n0 + denominator;
3538       cur_contribs->n1 = prev_contribs->n1 + denominator;
3539       ++cur_contribs;
3540       ++prev_contribs;
3541     }
3542     stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
3543   }
3544 
3545   coeffs = coefficient_group;
3546   contribs = contributors;
3547 
3548   for (n = 0; n < num_contributors; n++)
3549   {
3550     int i;
3551 
3552     // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
3553     if ( edge == STBIR_EDGE_ZERO )
3554     {
3555       // shrink the right side if necessary
3556       if ( contribs->n1 > input_last_n1 )
3557         contribs->n1 = input_last_n1;
3558 
3559       // shrink the left side
3560       if ( contribs->n0 < 0 )
3561       {
3562         int j, left, skips = 0;
3563 
3564         skips = -contribs->n0;
3565         contribs->n0 = 0;
3566 
3567         // now move down the weights
3568         left = contribs->n1 - contribs->n0 + 1;
3569         if ( left > 0 )
3570         {
3571           for( j = 0 ; j < left ; j++ )
3572             coeffs[ j ] = coeffs[ j + skips ];
3573         }
3574       }
3575     }
3576     else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
3577     {
3578       // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
3579 
3580       // right hand side first
3581       if ( contribs->n1 > input_last_n1 )
3582       {
3583         int start = contribs->n0;
3584         int endi = contribs->n1;
3585         contribs->n1 = input_last_n1;
3586         for( i = input_size; i <= endi; i++ )
3587           stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
3588       }
3589 
3590       // now check left hand edge
3591       if ( contribs->n0 < 0 )
3592       {
3593         int save_n0;
3594         float save_n0_coeff;
3595         float * c = coeffs - ( contribs->n0 + 1 );
3596 
3597         // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
3598         for( i = -1 ; i > contribs->n0 ; i-- )
3599           stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
3600         save_n0 = contribs->n0;
3601         save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
3602 
3603         // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
3604         contribs->n0 = 0;
3605         for(i = 0 ; i <= contribs->n1 ; i++ )
3606           coeffs[i] = coeffs[i-save_n0];
3607 
3608         // now that we have shrunk down the contribs, we insert the first one safely
3609         stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
3610       }
3611     }
3612 
3613     if ( contribs->n0 <= contribs->n1 )
3614     {
3615       int diff = contribs->n1 - contribs->n0 + 1;
3616       while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
3617         --diff;
3618 
3619       contribs->n1 = contribs->n0 + diff - 1;
3620 
3621       if ( contribs->n0 <= contribs->n1 )
3622       {
3623         if ( contribs->n0 < lowest )
3624           lowest = contribs->n0;
3625         if ( contribs->n1 > highest )
3626           highest = contribs->n1;
3627         if ( diff > widest )
3628           widest = diff;
3629       }
3630 
3631       // re-zero out unused coefficients (if any)
3632       for( i = diff ; i < coefficient_width ; i++ )
3633         coeffs[i] = 0.0f;
3634     }
3635 
3636     ++contribs;
3637     coeffs += coefficient_width;
3638   }
3639   filter_info->lowest = lowest;
3640   filter_info->highest = highest;
3641   filter_info->widest = widest;
3642 }
3643 
3644 #undef STBIR_RENORM_TYPE 
3645 
3646 static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
3647 {
3648   #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
3649   #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
3650   #ifdef STBIR_SIMD
3651   #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
3652   #else
3653   #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
3654   #endif
3655 
3656   int row_end = row1 + 1;
3657   STBIR__UNUSED( row0 ); // only used in an assert
3658 
3659   if ( coefficient_width != widest )
3660   {
3661     float * pc = coefficents;
3662     float * coeffs = coefficents;
3663     float * pc_end = coefficents + num_contributors * widest;
3664     switch( widest )
3665     {
3666       case 1:
3667         STBIR_NO_UNROLL_LOOP_START
3668         do {
3669           STBIR_MOVE_1( pc, coeffs );
3670           ++pc;
3671           coeffs += coefficient_width;
3672         } while ( pc < pc_end );
3673         break;
3674       case 2:
3675         STBIR_NO_UNROLL_LOOP_START
3676         do {
3677           STBIR_MOVE_2( pc, coeffs );
3678           pc += 2;
3679           coeffs += coefficient_width;
3680         } while ( pc < pc_end );
3681         break;
3682       case 3:
3683         STBIR_NO_UNROLL_LOOP_START
3684         do {
3685           STBIR_MOVE_2( pc, coeffs );
3686           STBIR_MOVE_1( pc+2, coeffs+2 );
3687           pc += 3;
3688           coeffs += coefficient_width;
3689         } while ( pc < pc_end );
3690         break;
3691       case 4:
3692         STBIR_NO_UNROLL_LOOP_START
3693         do {
3694           STBIR_MOVE_4( pc, coeffs );
3695           pc += 4;
3696           coeffs += coefficient_width;
3697         } while ( pc < pc_end );
3698         break;
3699       case 5:
3700         STBIR_NO_UNROLL_LOOP_START
3701         do {
3702           STBIR_MOVE_4( pc, coeffs );
3703           STBIR_MOVE_1( pc+4, coeffs+4 );
3704           pc += 5;
3705           coeffs += coefficient_width;
3706         } while ( pc < pc_end );
3707         break;
3708       case 6:
3709         STBIR_NO_UNROLL_LOOP_START
3710         do {
3711           STBIR_MOVE_4( pc, coeffs );
3712           STBIR_MOVE_2( pc+4, coeffs+4 );
3713           pc += 6;
3714           coeffs += coefficient_width;
3715         } while ( pc < pc_end );
3716         break;
3717       case 7:
3718         STBIR_NO_UNROLL_LOOP_START
3719         do {
3720           STBIR_MOVE_4( pc, coeffs );
3721           STBIR_MOVE_2( pc+4, coeffs+4 );
3722           STBIR_MOVE_1( pc+6, coeffs+6 );
3723           pc += 7;
3724           coeffs += coefficient_width;
3725         } while ( pc < pc_end );
3726         break;
3727       case 8:
3728         STBIR_NO_UNROLL_LOOP_START
3729         do {
3730           STBIR_MOVE_4( pc, coeffs );
3731           STBIR_MOVE_4( pc+4, coeffs+4 );
3732           pc += 8;
3733           coeffs += coefficient_width;
3734         } while ( pc < pc_end );
3735         break;
3736       case 9:
3737         STBIR_NO_UNROLL_LOOP_START
3738         do {
3739           STBIR_MOVE_4( pc, coeffs );
3740           STBIR_MOVE_4( pc+4, coeffs+4 );
3741           STBIR_MOVE_1( pc+8, coeffs+8 );
3742           pc += 9;
3743           coeffs += coefficient_width;
3744         } while ( pc < pc_end );
3745         break;
3746       case 10:
3747         STBIR_NO_UNROLL_LOOP_START
3748         do {
3749           STBIR_MOVE_4( pc, coeffs );
3750           STBIR_MOVE_4( pc+4, coeffs+4 );
3751           STBIR_MOVE_2( pc+8, coeffs+8 );
3752           pc += 10;
3753           coeffs += coefficient_width;
3754         } while ( pc < pc_end );
3755         break;
3756       case 11:
3757         STBIR_NO_UNROLL_LOOP_START
3758         do {
3759           STBIR_MOVE_4( pc, coeffs );
3760           STBIR_MOVE_4( pc+4, coeffs+4 );
3761           STBIR_MOVE_2( pc+8, coeffs+8 );
3762           STBIR_MOVE_1( pc+10, coeffs+10 );
3763           pc += 11;
3764           coeffs += coefficient_width;
3765         } while ( pc < pc_end );
3766         break;
3767       case 12:
3768         STBIR_NO_UNROLL_LOOP_START
3769         do {
3770           STBIR_MOVE_4( pc, coeffs );
3771           STBIR_MOVE_4( pc+4, coeffs+4 );
3772           STBIR_MOVE_4( pc+8, coeffs+8 );
3773           pc += 12;
3774           coeffs += coefficient_width;
3775         } while ( pc < pc_end );
3776         break;
3777       default:
3778         STBIR_NO_UNROLL_LOOP_START
3779         do {
3780           float * copy_end = pc + widest - 4;
3781           float * c = coeffs;
3782           do {
3783             STBIR_NO_UNROLL( pc );
3784             STBIR_MOVE_4( pc, c );
3785             pc += 4;
3786             c += 4;
3787           } while ( pc <= copy_end );
3788           copy_end += 4;
3789           STBIR_NO_UNROLL_LOOP_START
3790           while ( pc < copy_end )
3791           {
3792             STBIR_MOVE_1( pc, c );
3793             ++pc; ++c;
3794           }
3795           coeffs += coefficient_width;
3796         } while ( pc < pc_end );
3797         break;
3798     }
3799   }
3800 
3801   // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal
3802   coefficents[ widest * num_contributors ] = 8888.0f;
3803 
3804   // the minimum we might read for unrolled filters widths is 12. So, we need to
3805   //   make sure we never read outside the decode buffer, by possibly moving
3806   //   the sample area back into the scanline, and putting zeros weights first.
3807   // we start on the right edge and check until we're well past the possible
3808   //   clip area (2*widest).
3809   {
3810     stbir__contributors * contribs = contributors + num_contributors - 1;
3811     float * coeffs = coefficents + widest * ( num_contributors - 1 );
3812 
3813     // go until no chance of clipping (this is usually less than 8 lops)
3814     while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
3815     {
3816       // might we clip??
3817       if ( ( contribs->n0 + widest ) > row_end )
3818       {
3819         int stop_range = widest;
3820 
3821         // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
3822         //   of this contrib n1, instead of a fixed widest amount - so calculate this
3823         if ( widest > 12 )
3824         {
3825           int mod;
3826 
3827           // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3828           mod = widest & 3;
3829           stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3830 
3831           // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3832           if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3833         }
3834 
3835         // now see if we still clip with the refined range
3836         if ( ( contribs->n0 + stop_range ) > row_end )
3837         {
3838           int new_n0 = row_end - stop_range;
3839           int num = contribs->n1 - contribs->n0 + 1;
3840           int backup = contribs->n0 - new_n0;
3841           float * from_co = coeffs + num - 1;
3842           float * to_co = from_co + backup;
3843 
3844           STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
3845 
3846           // move the coeffs over
3847           while( num )
3848           {
3849             *to_co-- = *from_co--;
3850             --num;
3851           }
3852           // zero new positions
3853           while ( to_co >= coeffs )
3854             *to_co-- = 0;
3855           // set new start point
3856           contribs->n0 = new_n0;
3857           if ( widest > 12 )
3858           {
3859             int mod;
3860 
3861             // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3862             mod = widest & 3;
3863             stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3864 
3865             // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3866             if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3867           }
3868         }
3869       }
3870       --contribs;
3871       coeffs -= widest;
3872     }
3873   }
3874 
3875   return widest;
3876   #undef STBIR_MOVE_1
3877   #undef STBIR_MOVE_2
3878   #undef STBIR_MOVE_4
3879 }
3880 
3881 static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
3882 {
3883   int n;
3884   float scale = samp->scale_info.scale;
3885   stbir__kernel_callback * kernel = samp->filter_kernel;
3886   stbir__support_callback * support = samp->filter_support;
3887   float inv_scale = samp->scale_info.inv_scale;
3888   int input_full_size = samp->scale_info.input_full_size;
3889   int gather_num_contributors = samp->num_contributors;
3890   stbir__contributors* gather_contributors = samp->contributors;
3891   float * gather_coeffs = samp->coefficients;
3892   int gather_coefficient_width = samp->coefficient_width;
3893 
3894   switch ( samp->is_gather )
3895   {
3896     case 1: // gather upsample
3897     {
3898       float out_pixels_radius = support(inv_scale,user_data) * scale;
3899 
3900       stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
3901 
3902       STBIR_PROFILE_BUILD_START( cleanup );
3903       stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3904       STBIR_PROFILE_BUILD_END( cleanup );
3905     }
3906     break;
3907 
3908     case 0: // scatter downsample (only on vertical)
3909     case 2: // gather downsample
3910     {
3911       float in_pixels_radius = support(scale,user_data) * inv_scale;
3912       int filter_pixel_margin = samp->filter_pixel_margin;
3913       int input_end = input_full_size + filter_pixel_margin;
3914 
3915       // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
3916       if ( !samp->is_gather )
3917       {
3918         // check if we are using the same gather downsample on the horizontal as this vertical,
3919         //   if so, then we don't have to generate them, we can just pivot from the horizontal.
3920         if ( other_axis_for_pivot )
3921         {
3922           gather_contributors = other_axis_for_pivot->contributors;
3923           gather_coeffs = other_axis_for_pivot->coefficients;
3924           gather_coefficient_width = other_axis_for_pivot->coefficient_width;
3925           gather_num_contributors = other_axis_for_pivot->num_contributors;
3926           samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
3927           samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
3928           samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
3929           goto jump_right_to_pivot;
3930         }
3931 
3932         gather_contributors = samp->gather_prescatter_contributors;
3933         gather_coeffs = samp->gather_prescatter_coefficients;
3934         gather_coefficient_width = samp->gather_prescatter_coefficient_width;
3935         gather_num_contributors = samp->gather_prescatter_num_contributors;
3936       }
3937 
3938       stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
3939 
3940       STBIR_PROFILE_BUILD_START( cleanup );
3941       stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3942       STBIR_PROFILE_BUILD_END( cleanup );
3943 
3944       if ( !samp->is_gather )
3945       {
3946         // if this is a scatter (vertical only), then we need to pivot the coeffs
3947         stbir__contributors * scatter_contributors;
3948         int highest_set;
3949 
3950         jump_right_to_pivot:
3951 
3952         STBIR_PROFILE_BUILD_START( pivot );
3953 
3954         highest_set = (-filter_pixel_margin) - 1;
3955         for (n = 0; n < gather_num_contributors; n++)
3956         {
3957           int k;
3958           int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
3959           int scatter_coefficient_width = samp->coefficient_width;
3960           float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
3961           float * g_coeffs = gather_coeffs;
3962           scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
3963 
3964           for (k = gn0 ; k <= gn1 ; k++ )
3965           {
3966             float gc = *g_coeffs++;
3967             
3968             // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
3969             //   (which happens when pivoting from horizontal, which might have dummy zeros)
3970             if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
3971             {
3972               if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
3973               {
3974                 {
3975                   // if we are skipping over several contributors, we need to clear the skipped ones
3976                   stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
3977                   while ( clear_contributors < scatter_contributors )
3978                   {
3979                     clear_contributors->n0 = 0;
3980                     clear_contributors->n1 = -1;
3981                     ++clear_contributors;
3982                   }
3983                 }
3984                 scatter_contributors->n0 = n;
3985                 scatter_contributors->n1 = n;
3986                 scatter_coeffs[0]  = gc;
3987                 highest_set = k;
3988               }
3989               else
3990               {
3991                 stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
3992               }
3993               STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
3994             }
3995             ++scatter_contributors;
3996             scatter_coeffs += scatter_coefficient_width;
3997           }
3998 
3999           ++gather_contributors;
4000           gather_coeffs += gather_coefficient_width;
4001         }
4002 
4003         // now clear any unset contribs
4004         {
4005           stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
4006           stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
4007           while ( clear_contributors < end_contributors )
4008           {
4009             clear_contributors->n0 = 0;
4010             clear_contributors->n1 = -1;
4011             ++clear_contributors;
4012           }
4013         }
4014 
4015         STBIR_PROFILE_BUILD_END( pivot );
4016       }
4017     }
4018     break;
4019   }
4020 }
4021 
4022 
4023 //========================================================================================================
4024 // scanline decoders and encoders
     //
     // Each group below sets a handful of parameter macros, defines
     // STB_IMAGE_RESIZE_DO_CODERS and then re-includes the file named by
     // STBIR__HEADER_FILENAME.
     // NOTE(review): the section that this include selects is outside this view -
     //   presumably it stamps out the decode/encode routines for one channel ordering
     //   and #undefs the parameter macros afterwards (they are redefined here for
     //   every group with no #undef in between); confirm in the coder section.
     //
     // Parameters set per group:
     //   stbir__decode_suffix     - BGRA, ARGB, ABGR or AR; presumably pasted into the
     //                              generated routine names (TODO confirm)
     //   stbir__decode_swizzle    - defined (empty) for every group except the first
     //   stbir__decode_order0..3  - a permutation of the channel indices 0..3
     //   stbir__encode_order0..3  - in every group this is the inverse permutation of
     //                              the decode order
     //   stbir__coder_min_num     - 1, 2 or 4; presumably the smallest channel count the
     //                              variant applies to (TODO confirm)
4025 
     // default group: no suffix, no swizzle, no order macros
4026 #define stbir__coder_min_num 1
4027 #define STB_IMAGE_RESIZE_DO_CODERS
4028 #include STBIR__HEADER_FILENAME
4029 
     // BGRA: indices 0 and 2 exchanged, 1 and 3 in place (the permutation is its own inverse)
4030 #define stbir__decode_suffix BGRA
4031 #define stbir__decode_swizzle
4032 #define stbir__decode_order0  2
4033 #define stbir__decode_order1  1
4034 #define stbir__decode_order2  0
4035 #define stbir__decode_order3  3
4036 #define stbir__encode_order0  2
4037 #define stbir__encode_order1  1
4038 #define stbir__encode_order2  0
4039 #define stbir__encode_order3  3
4040 #define stbir__coder_min_num 4
4041 #define STB_IMAGE_RESIZE_DO_CODERS
4042 #include STBIR__HEADER_FILENAME
4043 
     // ARGB: a rotation by one position, so here the encode order (3,0,1,2) differs from
     // the decode order (1,2,3,0) - it is the rotation in the opposite direction
4044 #define stbir__decode_suffix ARGB
4045 #define stbir__decode_swizzle
4046 #define stbir__decode_order0  1
4047 #define stbir__decode_order1  2
4048 #define stbir__decode_order2  3
4049 #define stbir__decode_order3  0
4050 #define stbir__encode_order0  3
4051 #define stbir__encode_order1  0
4052 #define stbir__encode_order2  1
4053 #define stbir__encode_order3  2
4054 #define stbir__coder_min_num 4
4055 #define STB_IMAGE_RESIZE_DO_CODERS
4056 #include STBIR__HEADER_FILENAME
4057 
     // ABGR: full reversal of the four indices (its own inverse)
4058 #define stbir__decode_suffix ABGR
4059 #define stbir__decode_swizzle
4060 #define stbir__decode_order0  3
4061 #define stbir__decode_order1  2
4062 #define stbir__decode_order2  1
4063 #define stbir__decode_order3  0
4064 #define stbir__encode_order0  3
4065 #define stbir__encode_order1  2
4066 #define stbir__encode_order2  1
4067 #define stbir__encode_order3  0
4068 #define stbir__coder_min_num 4
4069 #define STB_IMAGE_RESIZE_DO_CODERS
4070 #include STBIR__HEADER_FILENAME
4071 
     // AR: adjacent pairs exchanged (0<->1 and 2<->3, its own inverse); the only swizzled
     // group with stbir__coder_min_num 2
4072 #define stbir__decode_suffix AR
4073 #define stbir__decode_swizzle
4074 #define stbir__decode_order0  1
4075 #define stbir__decode_order1  0
4076 #define stbir__decode_order2  3
4077 #define stbir__decode_order3  2
4078 #define stbir__encode_order0  1
4079 #define stbir__encode_order1  0
4080 #define stbir__encode_order2  3
4081 #define stbir__encode_order3  2
4082 #define stbir__coder_min_num 2
4083 #define STB_IMAGE_RESIZE_DO_CODERS
4084 #include STBIR__HEADER_FILENAME
4085 
4086 
4087 // fancy alpha means we expand to keep both premultiplied and non-premultiplied color channels
     //
     // Expands 4-float pixels (R G B A) into 7-float pixels (R G B A Rpm Gpm Bpm), in place.
     //   out_buffer            - start of the region; expanded pixels are written from here forwards
     //   width_times_channels  - number of INPUT floats (4 per pixel)
     // The input is read from the last width_times_channels floats of a region that is
     // ( width_times_channels / 4 ) * 7 floats long, i.e. it must already sit right-justified
     // in that region.  With n pixels, pixel i is read at float 3n+4i and written at float 7i,
     // so the scalar loop never overwrites input it has not read yet.
4088 static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
4089 {
4090   float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4091   float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
4092   float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
4093 
4094   // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
4095 
4096   #ifdef STBIR_SIMD
4097 
4098   #ifdef STBIR_SIMD8
       // 16 input floats (4 pixels) per iteration.  decode is biased forward by one batch so
       // that "decode <= end_decode" means a whole batch is still available; the bias is
       // removed again after the loop.
4099   decode += 16;
4100   STBIR_NO_UNROLL_LOOP_START
4101   while ( decode <= end_decode )
4102   {
4103     stbir__simdf8 d0,d1,a0,a1,p0,p1;
4104     STBIR_NO_UNROLL(decode);
         // NOTE(review): the stbir__simdf8_* helpers are defined elsewhere.  Going by their
         //   names, a* is each pixel's alpha broadcast across its lanes, p* = a* x d* is the
         //   premultiplied pixel, and bot4s/top4s pair the low/high halves of d* and p* so that
         //   each result holds one pixel's R G B A Rpm Gpm Bpm plus one spare lane - confirm.
4105     stbir__simdf8_load( d0, decode-16 );
4106     stbir__simdf8_load( d1, decode-16+8 );
4107     stbir__simdf8_0123to33333333( a0, d0 );
4108     stbir__simdf8_0123to33333333( a1, d1 );
4109     stbir__simdf8_mult( p0, a0, d0 );
4110     stbir__simdf8_mult( p1, a1, d1 );
4111     stbir__simdf8_bot4s( a0, d0, p0 );
4112     stbir__simdf8_bot4s( a1, d1, p1 );
4113     stbir__simdf8_top4s( d0, d0, p0 );
4114     stbir__simdf8_top4s( d1, d1, p1 );
         // the four stores are 7 floats apart; presumably each writes 8 floats, so the spare
         // lane of one store is overwritten by the next - keep them in ascending order
4115     stbir__simdf8_store ( out, a0 );
4116     stbir__simdf8_store ( out+7, d0 );
4117     stbir__simdf8_store ( out+14, a1 );
4118     stbir__simdf8_store ( out+21, d1 );
4119     decode += 16;
4120     out += 28;
4121   }
4122   decode -= 16;
4123   #else
       // 8 input floats (2 pixels) per iteration, using the same biased-pointer loop test
4124   decode += 8;
4125   STBIR_NO_UNROLL_LOOP_START
4126   while ( decode <= end_decode )
4127   {
4128     stbir__simdf d0,a0,d1,a1,p0,p1;
4129     STBIR_NO_UNROLL(decode);
         // both pixels are loaded before anything is stored
4130     stbir__simdf_load( d0, decode-8 );
4131     stbir__simdf_load( d1, decode-8+4 );
4132     stbir__simdf_0123to3333( a0, d0 );
4133     stbir__simdf_0123to3333( a1, d1 );
4134     stbir__simdf_mult( p0, a0, d0 );
4135     stbir__simdf_mult( p1, a1, d1 );
         // p* is the whole pixel times alpha, so (assuming 4-float stores) the store at out+4
         // also writes a 4th value at out+7; the store of d1 at out+7 then replaces it, which
         // is why the order of these stores matters
4136     stbir__simdf_store ( out, d0 );
4137     stbir__simdf_store ( out+4, p0 );
4138     stbir__simdf_store ( out+7, d1 );
4139     stbir__simdf_store ( out+7+4, p1 );
4140     decode += 8;
4141     out += 14;
4142   }
4143   decode -= 8;
4144   #endif
4145 
4146   // might be one last odd pixel
       // (16-float batches can leave up to three pixels over, hence the loop under STBIR_SIMD8;
       //  8-float batches leave at most one, hence the plain if otherwise)
4147   #ifdef STBIR_SIMD8
4148   STBIR_NO_UNROLL_LOOP_START
4149   while ( decode < end_decode )
4150   #else
4151   if ( decode < end_decode )
4152   #endif
4153   {
4154     stbir__simdf d,a,p;
4155     STBIR_NO_UNROLL(decode);
4156     stbir__simdf_load( d, decode );
4157     stbir__simdf_0123to3333( a, d );
4158     stbir__simdf_mult( p, a, d );
4159     stbir__simdf_store ( out, d );
         // NOTE(review): assuming a 4-float store, this also writes out+7; for the final pixel
         //   that is one float past end_decode - presumably the buffer has slack for it, confirm
         //   where the buffer is sized
4160     stbir__simdf_store ( out+4, p );
4161     decode += 4;
4162     out += 7;
4163   }
4164 
4165   #else
4166 
       // scalar build: one pixel per iteration; all four inputs are read before any output is written
4167   while( decode < end_decode )
4168   {
4169     float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
4170     out[0] = r;
4171     out[1] = g;
4172     out[2] = b;
4173     out[3] = alpha;
4174     out[4] = r * alpha;
4175     out[5] = g * alpha;
4176     out[6] = b * alpha;
4177     out += 7;
4178     decode += 4;
4179   }
4180 
4181   #endif
4182 }
4183 
     // Two-channel counterpart of the fancy alpha expansion: turns [X A] pairs into
     // [X A Xpm] triples (Xpm = X * A), in place.
     //   out_buffer            - start of the region; expanded pixels are written from here forwards
     //   width_times_channels  - number of INPUT floats (2 per pixel)
     // The input is read from the last width_times_channels floats of a region that is
     // ( width_times_channels / 2 ) * 3 floats long, i.e. it must already be right-justified there.
4184 static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
4185 {
4186   float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4187   float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
4188   float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
4189 
4190   //  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
4191 
4192   #ifdef STBIR_SIMD
4193 
       // 8 input floats (4 pixels) in, 12 floats out per iteration.  decode is biased forward
       // by one batch so that "decode <= end_decode" means a whole batch is available; the
       // bias is removed after the loop.
4194   decode += 8;
4195   if ( decode <= end_decode )
4196   {
4197     STBIR_NO_UNROLL_LOOP_START
4198     do {
4199       #ifdef STBIR_SIMD8
4200       stbir__simdf8 d0,a0,p0;
4201       STBIR_NO_UNROLL(decode);
4202       stbir__simdf8_load( d0, decode-8 );
           // NOTE(review): going by the helper names, the two swizzles duplicate the odd (A) and
           //   even (X) lanes, so their product is X*A repeated once per lane pair - confirm
4203       stbir__simdf8_0123to11331133( p0, d0 );
4204       stbir__simdf8_0123to00220022( a0, d0 );
4205       stbir__simdf8_mult( p0, p0, a0 );
4206 
           // Per group of two pixels: write the first [X A], then the products starting at out+2,
           // then the second [X A] at out+3 over the middle of those products.  Assuming
           // store2/store2h write the low/high two floats and store writes four, that leaves
           // X0 A0 Xpm0 X1 A1 Xpm1 - so the store at out+2 must come before the one at out+3.
4207       stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4208       stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
4209       stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4210 
4211       stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
4212       stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
4213       stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
4214       #else
4215       stbir__simdf d0,a0,d1,a1,p0,p1;
4216       STBIR_NO_UNROLL(decode);
           // same scheme as above with two 4-float registers instead of one 8-float register;
           // all four pixels are loaded before anything is stored
4217       stbir__simdf_load( d0, decode-8 );
4218       stbir__simdf_load( d1, decode-8+4 );
4219       stbir__simdf_0123to1133( p0, d0 );
4220       stbir__simdf_0123to1133( p1, d1 );
4221       stbir__simdf_0123to0022( a0, d0 );
4222       stbir__simdf_0123to0022( a1, d1 );
4223       stbir__simdf_mult( p0, p0, a0 );
4224       stbir__simdf_mult( p1, p1, a1 );
4225 
           // overlapping stores - keep the out+2 / out+8 stores ahead of the out+3 / out+9 ones
4226       stbir__simdf_store2( out, d0 );
4227       stbir__simdf_store( out+2, p0 );
4228       stbir__simdf_store2h( out+3, d0 );
4229 
4230       stbir__simdf_store2( out+6, d1 );
4231       stbir__simdf_store( out+8, p1 );
4232       stbir__simdf_store2h( out+9, d1 );
4233       #endif
4234       decode += 8;
4235       out += 12;
4236     } while ( decode <= end_decode );
4237   }
4238   decode -= 8;
4239   #endif
4240 
       // scalar loop: the whole job in a non-SIMD build, the leftover pixels in a SIMD build
4241   STBIR_SIMD_NO_UNROLL_LOOP_START
4242   while( decode < end_decode )
4243   {
4244     float x = decode[0], y = decode[1];
4245     STBIR_SIMD_NO_UNROLL(decode);
4246     out[0] = x;
4247     out[1] = y;
4248     out[2] = x * y;
4249     out += 3;
4250     decode += 2;
4251   }
4252 }
4253 
4254 static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
4255 {
4256   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4257   float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
4258   float const * end_output = encode_buffer + width_times_channels;
4259 
4260   // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
4261 
4262   STBIR_SIMD_NO_UNROLL_LOOP_START
4263   do {
4264     float alpha = input[3];
4265 #ifdef STBIR_SIMD
4266     stbir__simdf i,ia;
4267     STBIR_SIMD_NO_UNROLL(encode);
4268     if ( alpha < stbir__small_float )
4269     {
4270       stbir__simdf_load( i, input );
4271       stbir__simdf_store( encode, i );
4272     }
4273     else
4274     {
4275       stbir__simdf_load1frep4( ia, 1.0f / alpha );
4276       stbir__simdf_load( i, input+4 );
4277       stbir__simdf_mult( i, i, ia );
4278       stbir__simdf_store( encode, i );
4279       encode[3] = alpha;
4280     }
4281 #else
4282     if ( alpha < stbir__small_float )
4283     {
4284       encode[0] = input[0];
4285       encode[1] = input[1];
4286       encode[2] = input[2];
4287     }
4288     else
4289     {
4290       float ialpha = 1.0f / alpha;
4291       encode[0] = input[4] * ialpha;
4292       encode[1] = input[5] * ialpha;
4293       encode[2] = input[6] * ialpha;
4294     }
4295     encode[3] = alpha;
4296 #endif
4297 
4298     input += 7;
4299     encode += 4;
4300   } while ( encode < end_output );
4301 }
4302 
4303 //  format: [X A Xpm][X A Xpm] etc
4304 static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4305 {
4306   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4307   float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
4308   float const * end_output = encode_buffer + width_times_channels;
4309 
4310   do {
4311     float alpha = input[1];
4312     encode[0] = input[0];
4313     if ( alpha >= stbir__small_float )
4314       encode[0] = input[2] / alpha;
4315     encode[1] = alpha;
4316 
4317     input += 3;
4318     encode += 2;
4319   } while ( encode < end_output );
4320 }
4321 
4322 static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
4323 {
4324   float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4325   float const * end_decode = decode_buffer + width_times_channels;
4326 
4327   #ifdef STBIR_SIMD
4328   {
4329     decode += 2 * stbir__simdfX_float_count;
4330     STBIR_NO_UNROLL_LOOP_START
4331     while ( decode <= end_decode )
4332     {
4333       stbir__simdfX d0,a0,d1,a1;
4334       STBIR_NO_UNROLL(decode);
4335       stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
4336       stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
4337       stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
4338       stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
4339       stbir__simdfX_mult( d0, d0, a0 );
4340       stbir__simdfX_mult( d1, d1, a1 );
4341       stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
4342       stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
4343       decode += 2 * stbir__simdfX_float_count;
4344     }
4345     decode -= 2 * stbir__simdfX_float_count;
4346 
4347     // few last pixels remnants
4348     #ifdef STBIR_SIMD8
4349     STBIR_NO_UNROLL_LOOP_START
4350     while ( decode < end_decode )
4351     #else
4352     if ( decode < end_decode )
4353     #endif
4354     {
4355       stbir__simdf d,a;
4356       stbir__simdf_load( d, decode );
4357       stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
4358       stbir__simdf_mult( d, d, a );
4359       stbir__simdf_store ( decode, d );
4360       decode += 4;
4361     }
4362   }
4363 
4364   #else
4365 
4366   while( decode < end_decode )
4367   {
4368     float alpha = decode[3];
4369     decode[0] *= alpha;
4370     decode[1] *= alpha;
4371     decode[2] *= alpha;
4372     decode += 4;
4373   }
4374 
4375   #endif
4376 }
4377 
4378 static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
4379 {
4380   float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4381   float const * end_decode = decode_buffer + width_times_channels;
4382 
4383   #ifdef STBIR_SIMD
4384   decode += 2 * stbir__simdfX_float_count;
4385   STBIR_NO_UNROLL_LOOP_START
4386   while ( decode <= end_decode )
4387   {
4388     stbir__simdfX d0,a0,d1,a1;
4389     STBIR_NO_UNROLL(decode);
4390     stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
4391     stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
4392     stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
4393     stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
4394     stbir__simdfX_mult( d0, d0, a0 );
4395     stbir__simdfX_mult( d1, d1, a1 );
4396     stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
4397     stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
4398     decode += 2 * stbir__simdfX_float_count;
4399   }
4400   decode -= 2 * stbir__simdfX_float_count;
4401   #endif
4402 
4403   STBIR_SIMD_NO_UNROLL_LOOP_START
4404   while( decode < end_decode )
4405   {
4406     float alpha = decode[1];
4407     STBIR_SIMD_NO_UNROLL(decode);
4408     decode[0] *= alpha;
4409     decode += 2;
4410   }
4411 }
4412 
4413 static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
4414 {
4415   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4416   float const * end_output = encode_buffer + width_times_channels;
4417 
4418   STBIR_SIMD_NO_UNROLL_LOOP_START
4419   do {
4420     float alpha = encode[3];
4421 
4422 #ifdef STBIR_SIMD
4423     stbir__simdf i,ia;
4424     STBIR_SIMD_NO_UNROLL(encode);
4425     if ( alpha >= stbir__small_float )
4426     {
4427       stbir__simdf_load1frep4( ia, 1.0f / alpha );
4428       stbir__simdf_load( i, encode );
4429       stbir__simdf_mult( i, i, ia );
4430       stbir__simdf_store( encode, i );
4431       encode[3] = alpha;
4432     }
4433 #else
4434     if ( alpha >= stbir__small_float )
4435     {
4436       float ialpha = 1.0f / alpha;
4437       encode[0] *= ialpha;
4438       encode[1] *= ialpha;
4439       encode[2] *= ialpha;
4440     }
4441 #endif
4442     encode += 4;
4443   } while ( encode < end_output );
4444 }
4445 
4446 static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4447 {
4448   float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4449   float const * end_output = encode_buffer + width_times_channels;
4450 
4451   do {
4452     float alpha = encode[1];
4453     if ( alpha >= stbir__small_float )
4454       encode[0] /= alpha;
4455     encode += 2;
4456   } while ( encode < end_output );
4457 }
4458 
4459 
4460 // only used in RGB->BGR or BGR->RGB
     //
     // Exchanges the first and third float of every 3-float pixel, in place.
     //   decode_buffer         - the pixels, as floats
     //   width_times_channels  - number of floats to process (3 per pixel)
     // Each build flavour has a batched loop (end_decode is temporarily pulled back by one
     // batch so that "decode <= end_decode" means a whole batch fits), followed by a common
     // one-pixel-at-a-time loop for whatever is left.
4461 static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
4462 {
4463   float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4464   float const * end_decode = decode_buffer + width_times_channels;
4465 
4466 #ifdef STBIR_SIMD
4467     #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
           // 12 floats (4 pixels) per iteration, held in three registers
4468       end_decode -= 12; 
4469       STBIR_NO_UNROLL_LOOP_START
4470       while( decode <= end_decode )
4471       {
4472         // on arm64 8 instructions, no overlapping stores
4473         stbir__simdf a,b,c,na,nb;
4474         STBIR_SIMD_NO_UNROLL(decode);
4475         stbir__simdf_load( a, decode );
4476         stbir__simdf_load( b, decode+4 );
4477         stbir__simdf_load( c, decode+8 );
4478 
             // NOTE(review): reading selector values 0-3 as lanes of the first operand and 4-7
             //   as lanes of the second, the four swizzles below produce exactly the twelve
             //   floats with each pixel's first and third value exchanged - confirm against
             //   the stbir__simdf_swiz2 definition.  b is reused as an intermediate, so the
             //   statement order matters.
4479         na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );   
4480         b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );   
4481         nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );   
4482         c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );   
4483 
4484         stbir__simdf_store( decode, na );
4485         stbir__simdf_store( decode+4, nb ); 
4486         stbir__simdf_store( decode+8, c );
4487         decode += 12;
4488       }
4489       end_decode += 12;
4490     #else
           // 24 floats (8 pixels) per iteration
4491       end_decode -= 24;
4492       STBIR_NO_UNROLL_LOOP_START
4493       while( decode <= end_decode )
4494       {
4495         // 26 instructions on x64
4496         stbir__simdf a,b,c,d,e,f,g;
4497         float i21, i23;
4498         STBIR_SIMD_NO_UNROLL(decode);
             // seven loads, 3 floats apart: assuming 4-float loads, each register holds one
             // pixel plus the first float of the next pixel
4499         stbir__simdf_load( a, decode );
4500         stbir__simdf_load( b, decode+3 );
4501         stbir__simdf_load( c, decode+6 );
4502         stbir__simdf_load( d, decode+9 );
4503         stbir__simdf_load( e, decode+12 );
4504         stbir__simdf_load( f, decode+15 );
4505         stbir__simdf_load( g, decode+18 );
4506 
             // exchange lanes 0 and 2 of every register (selector 2,1,0,3); lane 3 is carried along unchanged
4507         a = stbir__simdf_swiz( a, 2, 1, 0, 3 );   
4508         b = stbir__simdf_swiz( b, 2, 1, 0, 3 );   
4509         c = stbir__simdf_swiz( c, 2, 1, 0, 3 );   
4510         d = stbir__simdf_swiz( d, 2, 1, 0, 3 );   
4511         e = stbir__simdf_swiz( e, 2, 1, 0, 3 );   
4512         f = stbir__simdf_swiz( f, 2, 1, 0, 3 );   
4513         g = stbir__simdf_swiz( g, 2, 1, 0, 3 );   
4514 
4515         // stores overlap, need to be in order, 
             // (each store's 4th float is the next pixel's old first value; the following
             //  store then overwrites it with the exchanged one)
             // The eighth pixel (floats 21..23) has no register of its own: its first and
             // third values are read here as scalars and written back exchanged after the
             // last vector store, which only puts the old value back at index 21.
4516         stbir__simdf_store( decode,    a );
4517         i21 = decode[21];
4518         stbir__simdf_store( decode+3,  b ); 
4519         i23 = decode[23];
4520         stbir__simdf_store( decode+6,  c );
4521         stbir__simdf_store( decode+9,  d );
4522         stbir__simdf_store( decode+12, e );
4523         stbir__simdf_store( decode+15, f );
4524         stbir__simdf_store( decode+18, g );
4525         decode[21] = i23;
4526         decode[23] = i21;
4527         decode += 24;
4528       }
4529       end_decode += 24;
4530     #endif
4531 #else
       // scalar build: 12 floats (4 pixels) per iteration
4532   end_decode -= 12;
4533   STBIR_NO_UNROLL_LOOP_START
4534   while( decode <= end_decode )
4535   {
4536     // 16 instructions
4537     float t0,t1,t2,t3;
4538     STBIR_NO_UNROLL(decode);
         // save the four first-channel values, move the third-channel values down, then
         // store the saved values into the third-channel slots
4539     t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
4540     decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
4541     decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
4542     decode += 12;
4543   }
4544   end_decode += 12;
4545 #endif
4546 
       // leftover pixels, one at a time (common to all build flavours)
4547   STBIR_NO_UNROLL_LOOP_START
4548   while( decode < end_decode )
4549   {
4550     float t = decode[0];
4551     STBIR_NO_UNROLL(decode);
4552     decode[0] = decode[2];
4553     decode[2] = t;
4554     decode += 3;
4555   }
4556 }
4557 
4558 
4559 
4560 static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
4561 {
4562   int channels = stbir_info->channels;
4563   int effective_channels = stbir_info->effective_channels;
4564   int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
4565   stbir_edge edge_horizontal = stbir_info->horizontal.edge;
4566   stbir_edge edge_vertical = stbir_info->vertical.edge;
4567   int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
4568   const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
4569   stbir__span const * spans = stbir_info->scanline_extents.spans;
4570   float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
4571 
4572   // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
4573   STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
4574 
4575   do
4576   {
4577     float * decode_buffer;
4578     void const * input_data;
4579     float * end_decode;
4580     int width_times_channels;
4581     int width;
4582 
4583     if ( spans->n1 < spans->n0 )
4584       break;
4585 
4586     width = spans->n1 + 1 - spans->n0;
4587     decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
4588     end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
4589     width_times_channels = width * channels;
4590 
4591     // read directly out of input plane by default
4592     input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
4593 
4594     // if we have an input callback, call it to get the input data
4595     if ( stbir_info->in_pixels_cb )
4596     {
4597       // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
4598       input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
4599     }
4600 
4601     STBIR_PROFILE_START( decode );
4602     // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
4603     stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
4604     STBIR_PROFILE_END( decode );
4605 
4606     if (stbir_info->alpha_weight)
4607     {
4608       STBIR_PROFILE_START( alpha );
4609       stbir_info->alpha_weight( decode_buffer, width_times_channels );
4610       STBIR_PROFILE_END( alpha );
4611     }
4612 
4613     ++spans;
4614   } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );
4615 
4616   // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
4617   // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
4618   //   wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
4619   if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
4620   {
4621     // this code only runs if we're in edge_wrap, and we're doing the entire scanline
4622     int e, start_x[2];
4623     int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
4624 
4625     start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
4626     start_x[1] =  input_full_size;                             // right edge
4627 
4628     for( e = 0; e < 2 ; e++ )
4629     {
4630       // do each margin
4631       int margin = stbir_info->scanline_extents.edge_sizes[e];
4632       if ( margin )
4633       {
4634         int x = start_x[e];
4635         float * marg = full_decode_buffer + x * effective_channels;
4636         float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
4637         STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
4638       }
4639     }
4640   }
4641 }
4642 
4643 
4644 //=================
4645 // Do 1 channel horizontal routines
     //
     // The macros below are the building blocks for filtering one output sample from a run
     // of decoded input samples; a SIMD and a scalar flavour are provided with the same
     // names.  They refer to identifiers that are not defined here - hc, decode, output,
     // horizontal_coefficients, horizontal_contributors, coefficient_width - so the code
     // that expands them must supply those.
     // NOTE(review): that code is presumably the section pulled in by the
     //   STB_IMAGE_RESIZE_DO_HORIZONTALS include at the end of this block, which is outside
     //   this view - confirm there (including whether it #undefs these macros, since the
     //   same names are defined again for the next channel count).
     //
     //   stbir__N_coeff_only()               N = 1..3: declare tot and compute the whole
     //                                       N-coefficient sum of products in one go
     //   stbir__store_output_tiny()          store tot and step to the next output
     //   stbir__4_coeff_start()              declare the accumulator(s) and do the first 4 coefficients
     //   stbir__4_coeff_continue_from_4(ofs) accumulate 4 more coefficients starting at offset ofs
     //   stbir__N_coeff_remnant(ofs)         N = 1..3: accumulate the last N coefficients at offset ofs
     //   stbir__store_output()               reduce the accumulator(s), store, step to the next output
     // Both store macros advance horizontal_coefficients by coefficient_width, increment
     // horizontal_contributors and advance output by 1.
4646 
4647 #ifdef STBIR_SIMD
4648 
     // SIMD flavour.  NOTE(review): the stbir__simdf_* helpers are defined elsewhere; the
     //   per-macro remarks below go by their names and by the scalar flavour further down.
4649 #define stbir__1_coeff_only()          \
4650     stbir__simdf tot,c;                \
4651     STBIR_SIMD_NO_UNROLL(decode);      \
4652     stbir__simdf_load1( c, hc );       \
4653     stbir__simdf_mult1_mem( tot, c, decode );
4654 
     // the two products presumably end up in lanes 0 and 1; the rotate + add1 sums them into lane 0
4655 #define stbir__2_coeff_only()          \
4656     stbir__simdf tot,c,d;              \
4657     STBIR_SIMD_NO_UNROLL(decode);      \
4658     stbir__simdf_load2z( c, hc );      \
4659     stbir__simdf_load2( d, decode );   \
4660     stbir__simdf_mult( tot, c, d );    \
4661     stbir__simdf_0123to1230( c, tot ); \
4662     stbir__simdf_add1( tot, tot, c );
4663 
     // full-width load and multiply; only the first three products are added into the result
4664 #define stbir__3_coeff_only()                  \
4665     stbir__simdf tot,c,t;                      \
4666     STBIR_SIMD_NO_UNROLL(decode);              \
4667     stbir__simdf_load( c, hc );                \
4668     stbir__simdf_mult_mem( tot, c, decode );   \
4669     stbir__simdf_0123to1230( c, tot );         \
4670     stbir__simdf_0123to2301( t, tot );         \
4671     stbir__simdf_add1( tot, tot, c );          \
4672     stbir__simdf_add1( tot, tot, t );
4673 
4674 #define stbir__store_output_tiny()                \
4675     stbir__simdf_store1( output, tot );           \
4676     horizontal_coefficients += coefficient_width; \
4677     ++horizontal_contributors;                    \
4678     output += 1;
4679 
     // note: the last line of this macro ends in a line continuation, so the blank line
     // that follows it is part of the definition and must stay
4680 #define stbir__4_coeff_start()                 \
4681     stbir__simdf tot,c;                        \
4682     STBIR_SIMD_NO_UNROLL(decode);              \
4683     stbir__simdf_load( c, hc );                \
4684     stbir__simdf_mult_mem( tot, c, decode );   \
4685 
4686 #define stbir__4_coeff_continue_from_4( ofs )  \
4687     STBIR_SIMD_NO_UNROLL(decode);              \
4688     stbir__simdf_load( c, hc + (ofs) );        \
4689     stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4690 
     // the remnant macros reuse the c declared by stbir__4_coeff_start and open their own
     // scope for the extra register d
4691 #define stbir__1_coeff_remnant( ofs )          \
4692     { stbir__simdf d;                          \
4693     stbir__simdf_load1z( c, hc + (ofs) );      \
4694     stbir__simdf_load1( d, decode + (ofs) );   \
4695     stbir__simdf_madd( tot, tot, d, c ); }
4696 
4697 #define stbir__2_coeff_remnant( ofs )          \
4698     { stbir__simdf d;                          \
4699     stbir__simdf_load2z( c, hc+(ofs) );        \
4700     stbir__simdf_load2( d, decode+(ofs) );     \
4701     stbir__simdf_madd( tot, tot, d, c ); }
4702 
     // declares the mask used by stbir__3_coeff_remnant; defined only in this SIMD flavour
     // (the scalar flavour below has no stbir__3_coeff_setup)
     // NOTE(review): STBIR_mask + 3 presumably selects a mask that keeps three lanes - confirm
4703 #define stbir__3_coeff_setup()                 \
4704     stbir__simdf mask;                         \
4705     stbir__simdf_load( mask, STBIR_mask + 3 );
4706 
     // loads a full register of coefficients and masks it before the multiply-add, so a
     // full-width read of hc and decode is made even though only three values are wanted
4707 #define stbir__3_coeff_remnant( ofs )                  \
4708     stbir__simdf_load( c, hc+(ofs) );                  \
4709     stbir__simdf_and( c, c, mask );                    \
4710     stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4711 
     // horizontal reduction of tot into a single value (two rotate-and-add steps), then store and advance
4712 #define stbir__store_output()                     \
4713     stbir__simdf_0123to2301( c, tot );            \
4714     stbir__simdf_add( tot, tot, c );              \
4715     stbir__simdf_0123to1230( c, tot );            \
4716     stbir__simdf_add1( tot, tot, c );             \
4717     stbir__simdf_store1( output, tot );           \
4718     horizontal_coefficients += coefficient_width; \
4719     ++horizontal_contributors;                    \
4720     output += 1;
4721 
4722 #else
4723 
     // scalar flavour: tot = sum of decode[i] * hc[i] over the N coefficients
4724 #define stbir__1_coeff_only()  \
4725     float tot;                 \
4726     tot = decode[0]*hc[0];
4727 
4728 #define stbir__2_coeff_only()  \
4729     float tot;                 \
4730     tot = decode[0] * hc[0];   \
4731     tot += decode[1] * hc[1];
4732 
4733 #define stbir__3_coeff_only()  \
4734     float tot;                 \
4735     tot = decode[0] * hc[0];   \
4736     tot += decode[1] * hc[1];  \
4737     tot += decode[2] * hc[2];
4738 
4739 #define stbir__store_output_tiny()                \
4740     output[0] = tot;                              \
4741     horizontal_coefficients += coefficient_width; \
4742     ++horizontal_contributors;                    \
4743     output += 1;
4744 
     // longer filters keep four independent partial sums, one per position modulo 4
4745 #define stbir__4_coeff_start()  \
4746     float tot0,tot1,tot2,tot3;  \
4747     tot0 = decode[0] * hc[0];   \
4748     tot1 = decode[1] * hc[1];   \
4749     tot2 = decode[2] * hc[2];   \
4750     tot3 = decode[3] * hc[3];
4751 
4752 #define stbir__4_coeff_continue_from_4( ofs )  \
4753     tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
4754     tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
4755     tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
4756     tot3 += decode[3+(ofs)] * hc[3+(ofs)];
4757 
4758 #define stbir__1_coeff_remnant( ofs )        \
4759     tot0 += decode[0+(ofs)] * hc[0+(ofs)];
4760 
     // note: the last line of this macro ends in a line continuation, so the blank line
     // that follows it is part of the definition and must stay
4761 #define stbir__2_coeff_remnant( ofs )        \
4762     tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
4763     tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
4764 
4765 #define stbir__3_coeff_remnant( ofs )        \
4766     tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
4767     tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
4768     tot2 += decode[2+(ofs)] * hc[2+(ofs)];
4769 
     // the partial sums are combined pairwise, (tot0+tot2)+(tot1+tot3), then stored
4770 #define stbir__store_output()                     \
4771     output[0] = (tot0+tot2)+(tot1+tot3);          \
4772     horizontal_coefficients += coefficient_width; \
4773     ++horizontal_contributors;                    \
4774     output += 1;
4775 
4776 #endif
4777 
     // select the channel count and re-include the header file with
     // STB_IMAGE_RESIZE_DO_HORIZONTALS defined (see the NOTE at the top of this block)
4778 #define STBIR__horizontal_channels 1
4779 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
4780 #include STBIR__HEADER_FILENAME
4781 
4782 
4783 //=================
4784 // Do 2 channel horizontal routines
4785 
4786 #ifdef STBIR_SIMD
4787 
4788 #define stbir__1_coeff_only()         \
4789     stbir__simdf tot,c,d;             \
4790     STBIR_SIMD_NO_UNROLL(decode);     \
4791     stbir__simdf_load1z( c, hc );     \
4792     stbir__simdf_0123to0011( c, c );  \
4793     stbir__simdf_load2( d, decode );  \
4794     stbir__simdf_mult( tot, d, c );
4795 
4796 #define stbir__2_coeff_only()         \
4797     stbir__simdf tot,c;               \
4798     STBIR_SIMD_NO_UNROLL(decode);     \
4799     stbir__simdf_load2( c, hc );      \
4800     stbir__simdf_0123to0011( c, c );  \
4801     stbir__simdf_mult_mem( tot, c, decode );
4802 
4803 #define stbir__3_coeff_only()                \
4804     stbir__simdf tot,c,cs,d;                 \
4805     STBIR_SIMD_NO_UNROLL(decode);            \
4806     stbir__simdf_load( cs, hc );             \
4807     stbir__simdf_0123to0011( c, cs );        \
4808     stbir__simdf_mult_mem( tot, c, decode ); \
4809     stbir__simdf_0123to2222( c, cs );        \
4810     stbir__simdf_load2z( d, decode+4 );      \
4811     stbir__simdf_madd( tot, tot, d, c );
4812 
4813 #define stbir__store_output_tiny()                \
4814     stbir__simdf_0123to2301( c, tot );            \
4815     stbir__simdf_add( tot, tot, c );              \
4816     stbir__simdf_store2( output, tot );           \
4817     horizontal_coefficients += coefficient_width; \
4818     ++horizontal_contributors;                    \
4819     output += 2;
4820 
4821 #ifdef STBIR_SIMD8
4822 
4823 #define stbir__4_coeff_start()                    \
4824     stbir__simdf8 tot0,c,cs;                      \
4825     STBIR_SIMD_NO_UNROLL(decode);                 \
4826     stbir__simdf8_load4b( cs, hc );               \
4827     stbir__simdf8_0123to00112233( c, cs );        \
4828     stbir__simdf8_mult_mem( tot0, c, decode );
4829 
4830 #define stbir__4_coeff_continue_from_4( ofs )        \
4831     STBIR_SIMD_NO_UNROLL(decode);                    \
4832     stbir__simdf8_load4b( cs, hc + (ofs) );          \
4833     stbir__simdf8_0123to00112233( c, cs );           \
4834     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4835 
4836 #define stbir__1_coeff_remnant( ofs )                \
4837     { stbir__simdf t,d;                              \
4838     stbir__simdf_load1z( t, hc + (ofs) );            \
4839     stbir__simdf_load2( d, decode + (ofs) * 2 );     \
4840     stbir__simdf_0123to0011( t, t );                 \
4841     stbir__simdf_mult( t, t, d );                    \
4842     stbir__simdf8_add4( tot0, tot0, t ); }
4843  
// 2-channel, 8-wide SIMD: fold two trailing coefficients (starting at index `ofs`) into tot0.
// Uses a 4-wide temporary t: coefficients from hc+ofs are rearranged (0123to0011),
// multiplied by the floats at decode+ofs*2, and the result is added into tot0.
4844 #define stbir__2_coeff_remnant( ofs )                \
4845     { stbir__simdf t;                                \
4846     stbir__simdf_load2( t, hc + (ofs) );             \
4847     stbir__simdf_0123to0011( t, t );                 \
4848     stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
4849     stbir__simdf8_add4( tot0, tot0, t ); }
4850 
// 2-channel, 8-wide SIMD: fold three trailing coefficients (starting at index `ofs`) into tot0.
// Coefficients are loaded with the same load4b used for full groups; the decoded data is
// loaded with load6z into a local 8-wide register d before the multiply-add.
// NOTE(review): load6z presumably reads six floats and zeroes the remaining two lanes, which
// would cancel the fourth coefficient -- inferred from the helper name, confirm.
4851 #define stbir__3_coeff_remnant( ofs )                \
4852     { stbir__simdf8 d;                               \
4853     stbir__simdf8_load4b( cs, hc + (ofs) );          \
4854     stbir__simdf8_0123to00112233( c, cs );           \
4855     stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
4856     stbir__simdf8_madd( tot0, tot0, c, d ); }
4857 
// 2-channel, 8-wide SIMD: reduce tot0 to one pixel, store it, and step to the next output pixel.
// The reduction is done in two stages in 4-wide temporaries: add4halves on tot0, then an add
// with a 2,3,0,1 lane-reordered copy; two floats are stored to output.
// NOTE(review): add4halves presumably sums the low and high 4-lane halves (inferred from name).
4858 #define stbir__store_output()                     \
4859     { stbir__simdf t,d;                           \
4860     stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
4861     stbir__simdf_0123to2301( d, t );              \
4862     stbir__simdf_add( t, t, d );                  \
4863     stbir__simdf_store2( output, t );             \
4864     horizontal_coefficients += coefficient_width; \
4865     ++horizontal_contributors;                    \
4866     output += 2; }
4867 
4868 #else
4869 
// 2-channel, 4-wide SIMD: start a dot product of four or more coefficients.
// Declares two accumulators (tot0, tot1) and scratch c / cs in the enclosing scope.
// The four coefficients loaded from hc are split in two: the 0011 arrangement multiplies
// the floats at decode into tot0, the 2233 arrangement multiplies those at decode+4 into tot1.
4870 #define stbir__4_coeff_start()                   \
4871     stbir__simdf tot0,tot1,c,cs;                 \
4872     STBIR_SIMD_NO_UNROLL(decode);                \
4873     stbir__simdf_load( cs, hc );                 \
4874     stbir__simdf_0123to0011( c, cs );            \
4875     stbir__simdf_mult_mem( tot0, c, decode );    \
4876     stbir__simdf_0123to2233( c, cs );            \
4877     stbir__simdf_mult_mem( tot1, c, decode+4 );
4878 
// 2-channel, 4-wide SIMD: accumulate four more coefficients starting at coefficient index `ofs`.
// Same split as stbir__4_coeff_start(): the 0011 arrangement goes with decode+ofs*2 into tot0,
// the 2233 arrangement with decode+ofs*2+4 into tot1.
4879 #define stbir__4_coeff_continue_from_4( ofs )                \
4880     STBIR_SIMD_NO_UNROLL(decode);                            \
4881     stbir__simdf_load( cs, hc + (ofs) );                     \
4882     stbir__simdf_0123to0011( c, cs );                        \
4883     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
4884     stbir__simdf_0123to2233( c, cs );                        \
4885     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
4886 
// 2-channel, 4-wide SIMD: fold one trailing coefficient (index `ofs`) into tot0.
// Only two floats (one pixel) are loaded from decode+ofs*2 into the local register d.
// NOTE(review): load1z presumably zero-fills the unused lanes (inferred from the name).
4887 #define stbir__1_coeff_remnant( ofs )            \
4888     { stbir__simdf d;                            \
4889     stbir__simdf_load1z( cs, hc + (ofs) );       \
4890     stbir__simdf_0123to0011( c, cs );            \
4891     stbir__simdf_load2( d, decode + (ofs) * 2 ); \
4892     stbir__simdf_madd( tot0, tot0, d, c ); }
4893 
// 2-channel, 4-wide SIMD: fold two trailing coefficients (starting at index `ofs`) into tot0.
// Two coefficients are loaded from hc+ofs, arranged 0011, and multiply-added with the
// floats at decode+ofs*2.
4894 #define stbir__2_coeff_remnant( ofs )                      \
4895     stbir__simdf_load2( cs, hc + (ofs) );                  \
4896     stbir__simdf_0123to0011( c, cs );                      \
4897     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4898 
// 2-channel, 4-wide SIMD: fold three trailing coefficients (starting at index `ofs`).
// The first two go through tot0 exactly like a full group; the third is handled separately:
// only two floats are loaded from decode+ofs*2+4 (load2z) and multiply-added into tot1.
// NOTE(review): load2z presumably zeroes the upper two lanes (inferred from the name).
4899 #define stbir__3_coeff_remnant( ofs )                       \
4900     { stbir__simdf d;                                       \
4901     stbir__simdf_load( cs, hc + (ofs) );                    \
4902     stbir__simdf_0123to0011( c, cs );                       \
4903     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
4904     stbir__simdf_0123to2222( c, cs );                       \
4905     stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
4906     stbir__simdf_madd( tot1, tot1, d, c ); }
4907 
// 2-channel, 4-wide SIMD: reduce tot0/tot1 to one pixel, store it, and step to the next pixel.
// tot1 is added into tot0, then tot0 is added to a 2,3,0,1 lane-reordered copy of itself
// before two floats are stored to output.  Clobbers tot0 and c.
4908 #define stbir__store_output()                     \
4909     stbir__simdf_add( tot0, tot0, tot1 );         \
4910     stbir__simdf_0123to2301( c, tot0 );           \
4911     stbir__simdf_add( tot0, tot0, c );            \
4912     stbir__simdf_store2( output, tot0 );          \
4913     horizontal_coefficients += coefficient_width; \
4914     ++horizontal_contributors;                    \
4915     output += 2;
4916 
4917 #endif
4918 
4919 #else
4920 
// 2-channel scalar path, single coefficient: scale the first decoded pixel's
// two channels by hc[0].  Leaves the results in tota / totb (declared here,
// together with c, in the enclosing scope) for stbir__store_output_tiny().
#define stbir__1_coeff_only()      \
    float c = hc[0];               \
    float tota = decode[0] * c;    \
    float totb = decode[1] * c;
4926 
// 2-channel scalar path, two coefficients: tota / totb receive the weighted
// sum of the first two decoded pixels (weights hc[0], hc[1]).
#define stbir__2_coeff_only()      \
    float c = hc[0];               \
    float tota = decode[0] * c;    \
    float totb = decode[1] * c;    \
    c = hc[1];                     \
    tota += decode[2] * c;         \
    totb += decode[3] * c;
4935 
// 2-channel scalar path, three coefficients.
// The terms are deliberately accumulated in the order hc[0], hc[2], hc[1]:
// this unusual order matches the SIMD implementation's order of additions.
#define stbir__3_coeff_only()      \
    float c = hc[0];               \
    float tota = decode[0] * c;    \
    float totb = decode[1] * c;    \
    c = hc[2];                     \
    tota += decode[4] * c;         \
    totb += decode[5] * c;         \
    c = hc[1];                     \
    tota += decode[2] * c;         \
    totb += decode[3] * c;
4948 
// 2-channel scalar path: write the finished pixel (tota, totb), then move
// output, the coefficient pointer and the contributor pointer on to the
// next output pixel.
#define stbir__store_output_tiny()                  \
    *output++ = tota;                               \
    *output++ = totb;                               \
    ++horizontal_contributors;                      \
    horizontal_coefficients += coefficient_width;
4955 
// 2-channel scalar path: start a dot product of four or more coefficients.
// Declares one accumulator pair (totaK, totbK) per coefficient slot K = 0..3
// and initialises each with pixel K of decode scaled by hc[K].
#define stbir__4_coeff_start()                                        \
    float tota0, totb0;                                               \
    float tota1, totb1;                                               \
    float tota2, totb2;                                               \
    float tota3, totb3;                                               \
    float c;                                                          \
    c = hc[0];  tota0 = decode[0]*c;  totb0 = decode[1]*c;            \
    c = hc[1];  tota1 = decode[2]*c;  totb1 = decode[3]*c;            \
    c = hc[2];  tota2 = decode[4]*c;  totb2 = decode[5]*c;            \
    c = hc[3];  tota3 = decode[6]*c;  totb3 = decode[7]*c;
4970 
// 2-channel scalar path: accumulate four more coefficients, starting at
// coefficient index `ofs`, into the accumulators from stbir__4_coeff_start().
// Coefficient ofs+K feeds accumulator pair K; its pixel starts at decode[(ofs+K)*2].
#define stbir__4_coeff_continue_from_4( ofs )                                                \
    c = hc[(ofs)];    tota0 += decode[(ofs)*2]*c;    totb0 += decode[(ofs)*2+1]*c;           \
    c = hc[(ofs)+1];  tota1 += decode[(ofs)*2+2]*c;  totb1 += decode[(ofs)*2+3]*c;           \
    c = hc[(ofs)+2];  tota2 += decode[(ofs)*2+4]*c;  totb2 += decode[(ofs)*2+5]*c;           \
    c = hc[(ofs)+3];  tota3 += decode[(ofs)*2+6]*c;  totb3 += decode[(ofs)*2+7]*c;
4984 
// 2-channel scalar path: fold one trailing coefficient (index `ofs`) into
// accumulator pair 0.
#define stbir__1_coeff_remnant( ofs )                                                   \
    c = hc[(ofs)];  tota0 += decode[(ofs)*2] * c;  totb0 += decode[(ofs)*2+1] * c;
4989 
// 2-channel scalar path: fold two trailing coefficients (starting at index
// `ofs`) into accumulator pairs 0 and 1.
#define stbir__2_coeff_remnant( ofs )                                                       \
    c = hc[(ofs)];    tota0 += decode[(ofs)*2] * c;    totb0 += decode[(ofs)*2+1] * c;      \
    c = hc[(ofs)+1];  tota1 += decode[(ofs)*2+2] * c;  totb1 += decode[(ofs)*2+3] * c;
4997 
// 2-channel scalar path: fold three trailing coefficients (starting at index
// `ofs`) into accumulator pairs 0, 1 and 2.
#define stbir__3_coeff_remnant( ofs )                                                       \
    c = hc[(ofs)];    tota0 += decode[(ofs)*2] * c;    totb0 += decode[(ofs)*2+1] * c;      \
    c = hc[(ofs)+1];  tota1 += decode[(ofs)*2+2] * c;  totb1 += decode[(ofs)*2+3] * c;      \
    c = hc[(ofs)+2];  tota2 += decode[(ofs)*2+4] * c;  totb2 += decode[(ofs)*2+5] * c;
5008 
// 2-channel scalar path: combine the four accumulator pairs into one pixel,
// write it, and move on to the next output pixel.  The pairing
// (0+2)+(1+3) is part of the numeric contract and must be kept as written.
#define stbir__store_output()                       \
    *output++ = (tota0+tota2)+(tota1+tota3);        \
    *output++ = (totb0+totb2)+(totb1+totb3);        \
    ++horizontal_contributors;                      \
    horizontal_coefficients += coefficient_width;
5015 
5016 #endif
5017 
// Instantiate the 2-channel horizontal routines: with the stbir__*_coeff_* macros above in
// effect, set the channel count, select the horizontal section, and include
// STBIR__HEADER_FILENAME (presumably this header itself -- confirm where it is defined).
5018 #define STBIR__horizontal_channels 2
5019 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
5020 #include STBIR__HEADER_FILENAME
5021 
5022 
5023 //=================
5024 // Do 3 channel horizontal routines
5025 
5026 #ifdef STBIR_SIMD
5027 
// 3-channel SIMD, single coefficient: tot = (floats at decode) * coefficient.
// Declares tot, c and d in the enclosing scope.
// NOTE(review): stbir__simdf_load presumably reads four floats, i.e. one float beyond the
// 3-channel pixel; the 0001 arrangement after load1z presumably leaves that extra lane's
// weight at zero.  Both points are inferred from helper names -- confirm.
5028 #define stbir__1_coeff_only()         \
5029     stbir__simdf tot,c,d;             \
5030     STBIR_SIMD_NO_UNROLL(decode);     \
5031     stbir__simdf_load1z( c, hc );     \
5032     stbir__simdf_0123to0001( c, c );  \
5033     stbir__simdf_load( d, decode );   \
5034     stbir__simdf_mult( tot, d, c );
5035 
// 3-channel SIMD, two coefficients: each coefficient is broadcast (0000 / 1111) and
// multiplied with the register loaded at its pixel (decode, decode+3), summing into tot.
// NOTE(review): each stbir__simdf_load presumably reads four floats, so the load at decode+3
// would reach one float past the second pixel -- confirm the decode buffer allows this.
5036 #define stbir__2_coeff_only()         \
5037     stbir__simdf tot,c,cs,d;          \
5038     STBIR_SIMD_NO_UNROLL(decode);     \
5039     stbir__simdf_load2( cs, hc );     \
5040     stbir__simdf_0123to0000( c, cs ); \
5041     stbir__simdf_load( d, decode );   \
5042     stbir__simdf_mult( tot, d, c );   \
5043     stbir__simdf_0123to1111( c, cs ); \
5044     stbir__simdf_load( d, decode+3 ); \
5045     stbir__simdf_madd( tot, tot, d, c );
5046 
// 3-channel SIMD, three coefficients: each coefficient is broadcast (0000 / 1111 / 2222) and
// multiplied with the register loaded at its pixel (decode, decode+3, decode+6), summing into tot.
// NOTE(review): the coefficient load reads hc with a full stbir__simdf_load although only three
// coefficients are used, and the load at decode+6 presumably reaches one float past the third
// pixel -- confirm both buffers are padded accordingly.
5047 #define stbir__3_coeff_only()            \
5048     stbir__simdf tot,c,d,cs;             \
5049     STBIR_SIMD_NO_UNROLL(decode);        \
5050     stbir__simdf_load( cs, hc );         \
5051     stbir__simdf_0123to0000( c, cs );    \
5052     stbir__simdf_load( d, decode );      \
5053     stbir__simdf_mult( tot, d, c );      \
5054     stbir__simdf_0123to1111( c, cs );    \
5055     stbir__simdf_load( d, decode+3 );    \
5056     stbir__simdf_madd( tot, tot, d, c ); \
5057     stbir__simdf_0123to2222( c, cs );    \
5058     stbir__simdf_load( d, decode+6 );    \
5059     stbir__simdf_madd( tot, tot, d, c );
5060 
// 3-channel SIMD: store the pixel held in tot and step to the next output pixel.
// The three floats are written as a 2-float store to output followed, after a 2,3,0,1 lane
// reorder, by a 1-float store to output+2 -- so nothing beyond the pixel is written.
// Clobbers tot.
5061 #define stbir__store_output_tiny()                \
5062     stbir__simdf_store2( output, tot );           \
5063     stbir__simdf_0123to2301( tot, tot );          \
5064     stbir__simdf_store1( output+2, tot );         \
5065     horizontal_coefficients += coefficient_width; \
5066     ++horizontal_contributors;                    \
5067     output += 3;
5068 
5069 #ifdef STBIR_SIMD8
5070 
// 3-channel, 8-wide SIMD: start a dot product of four or more coefficients.
// FYI: the XXXYYY decode data is loaded from one float earlier (decode - 1) so that the
// XXX and YYY pixels land in different halves of the AVX register.
// Declares 8-wide tot0, tot1, c, cs and a 4-wide scratch t in the enclosing scope;
// coefficients 0,1 go with decode-1 into tot0 and coefficients 2,3 with decode+6-1 into tot1.
5072 #define stbir__4_coeff_start()                     \
5073     stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t;  \
5074     STBIR_SIMD_NO_UNROLL(decode);                  \
5075     stbir__simdf8_load4b( cs, hc );                \
5076     stbir__simdf8_0123to00001111( c, cs );         \
5077     stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
5078     stbir__simdf8_0123to22223333( c, cs );         \
5079     stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
5080 
// 3-channel, 8-wide SIMD: accumulate four more coefficients starting at coefficient index `ofs`.
// Same layout as stbir__4_coeff_start(): decoded data is addressed at decode+ofs*3 with the
// same -1 float bias; coefficients 0,1 accumulate into tot0 and 2,3 into tot1.
5081 #define stbir__4_coeff_continue_from_4( ofs )      \
5082     STBIR_SIMD_NO_UNROLL(decode);                  \
5083     stbir__simdf8_load4b( cs, hc + (ofs) );        \
5084     stbir__simdf8_0123to00001111( c, cs );         \
5085     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
5086     stbir__simdf8_0123to22223333( c, cs );         \
5087     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
5088 
// 3-channel, 8-wide SIMD: fold one trailing coefficient (index `ofs`) into tot0.
// The coefficient is replicated into the 4-wide scratch t (load1rep4) and multiply-added
// with the data at decode+ofs*3-1 via the 4-wide madd_mem4 variant.
5089 #define stbir__1_coeff_remnant( ofs )                          \
5090     STBIR_SIMD_NO_UNROLL(decode);                              \
5091     stbir__simdf_load1rep4( t, hc + (ofs) );                   \
5092     stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
5093 
// 3-channel, 8-wide SIMD: fold two trailing coefficients (starting at index `ofs`) into tot0.
// The coefficient load starts two entries early (hc+ofs-2) and then selects entries 2 and 3
// of that load (22223333), i.e. hc[ofs] and hc[ofs+1]; this reads coefficients that were
// already consumed rather than reading beyond the two that remain.
5094 #define stbir__2_coeff_remnant( ofs )                          \
5095     STBIR_SIMD_NO_UNROLL(decode);                              \
5096     stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
5097     stbir__simdf8_0123to22223333( c, cs );                     \
5098     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
5099 
// 3-channel, 8-wide SIMD: fold three trailing coefficients (starting at index `ofs`).
// The first two are handled like a full group into tot0; the third is extracted into the
// 4-wide scratch t (0123to2222) and multiply-added into tot1 with the 4-wide madd_mem4 variant.
5100  #define stbir__3_coeff_remnant( ofs )                           \
5101     STBIR_SIMD_NO_UNROLL(decode);                                \
5102     stbir__simdf8_load4b( cs, hc + (ofs) );                      \
5103     stbir__simdf8_0123to00001111( c, cs );                       \
5104     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
5105     stbir__simdf8_0123to2222( t, cs );                           \
5106     stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
5107 
// 3-channel, 8-wide SIMD: reduce tot0/tot1 to one pixel in t, store it, and step to the next
// output pixel.  Must be expanded inside a loop: it ends the iteration with `continue` or `break`.
// While more pixels follow (output < output_end after the advance) the pixel is written with a
// single 4-float store at output-3 and the loop continues; on the last pixel exactly three
// floats are written (2-float store plus 1-float store) and the loop is left with `break`.
// NOTE(review): the 4-float store presumably spills one float into the next pixel's slot, which
// the next iteration overwrites -- inferred from the code shape, confirm.
5108 #define stbir__store_output()                       \
5109     stbir__simdf8_add( tot0, tot0, tot1 );          \
5110     stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
5111     stbir__simdf8_add4halves( t, t, tot0 );         \
5112     horizontal_coefficients += coefficient_width;   \
5113     ++horizontal_contributors;                      \
5114     output += 3;                                    \
5115     if ( output < output_end )                      \
5116     {                                               \
5117       stbir__simdf_store( output-3, t );            \
5118       continue;                                     \
5119     }                                               \
5120     { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
5121     stbir__simdf_store2( output-3, t );             \
5122     stbir__simdf_store1( output+2-3, tt ); }        \
5123     break;
5124 
5125 
5126 #else
5127 
// 3-channel, 4-wide SIMD: start a dot product of four or more coefficients.
// Declares tot0, tot1, tot2, c, cs in the enclosing scope.  The twelve decoded floats
// (decode, decode+4, decode+8) are covered by three registers; the coefficient arrangements
// 0001 / 1122 / 2333 give each float the coefficient of the pixel it belongs to.
5128 #define stbir__4_coeff_start()                  \
5129     stbir__simdf tot0,tot1,tot2,c,cs;           \
5130     STBIR_SIMD_NO_UNROLL(decode);               \
5131     stbir__simdf_load( cs, hc );                \
5132     stbir__simdf_0123to0001( c, cs );           \
5133     stbir__simdf_mult_mem( tot0, c, decode );   \
5134     stbir__simdf_0123to1122( c, cs );           \
5135     stbir__simdf_mult_mem( tot1, c, decode+4 ); \
5136     stbir__simdf_0123to2333( c, cs );           \
5137     stbir__simdf_mult_mem( tot2, c, decode+8 );
5138 
// 3-channel, 4-wide SIMD: accumulate four more coefficients starting at coefficient index `ofs`.
// Same 0001 / 1122 / 2333 layout as stbir__4_coeff_start(), reading decoded floats at
// decode+ofs*3, +4 and +8 and accumulating into tot0, tot1 and tot2.
5139 #define stbir__4_coeff_continue_from_4( ofs )                 \
5140     STBIR_SIMD_NO_UNROLL(decode);                             \
5141     stbir__simdf_load( cs, hc + (ofs) );                      \
5142     stbir__simdf_0123to0001( c, cs );                         \
5143     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
5144     stbir__simdf_0123to1122( c, cs );                         \
5145     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
5146     stbir__simdf_0123to2333( c, cs );                         \
5147     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
5148 
// 3-channel, 4-wide SIMD: fold one trailing coefficient (index `ofs`) into tot0.
// NOTE(review): load1z presumably zero-fills lanes 1..3, so after the 0001 arrangement the
// fourth lane's weight would be zero and the extra float read at decode+ofs*3+3 would not
// contribute -- inferred from helper names, confirm.
5149 #define stbir__1_coeff_remnant( ofs )         \
5150     STBIR_SIMD_NO_UNROLL(decode);             \
5151     stbir__simdf_load1z( c, hc + (ofs) );     \
5152     stbir__simdf_0123to0001( c, c );          \
5153     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
5154 
// 3-channel, 4-wide SIMD: fold two trailing coefficients (starting at index `ofs`).
// The six decoded floats are consumed as a 4-float multiply-add from decode+ofs*3 into tot0
// plus a 2-float load (load2z into local d) from decode+ofs*3+4 into tot1.
// NOTE(review): load2z presumably zeroes the unused lanes (inferred from the name).
5155 #define stbir__2_coeff_remnant( ofs )                       \
5156     { stbir__simdf d;                                       \
5157     STBIR_SIMD_NO_UNROLL(decode);                           \
5158     stbir__simdf_load2z( cs, hc + (ofs) );                  \
5159     stbir__simdf_0123to0001( c, cs );                       \
5160     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
5161     stbir__simdf_0123to1122( c, cs );                       \
5162     stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
5163     stbir__simdf_madd( tot1, tot1, c, d ); }
5164 
// 3-channel, 4-wide SIMD: fold three trailing coefficients (starting at index `ofs`).
// The nine decoded floats are consumed as two 4-float multiply-adds (into tot0 and tot1)
// plus a single float loaded with load1z from decode+ofs*3+8 into tot2.
// NOTE(review): load1z presumably zeroes the unused lanes (inferred from the name).
5165 #define stbir__3_coeff_remnant( ofs )                         \
5166     { stbir__simdf d;                                         \
5167     STBIR_SIMD_NO_UNROLL(decode);                             \
5168     stbir__simdf_load( cs, hc + (ofs) );                      \
5169     stbir__simdf_0123to0001( c, cs );                         \
5170     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
5171     stbir__simdf_0123to1122( c, cs );                         \
5172     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
5173     stbir__simdf_0123to2222( c, cs );                         \
5174     stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
5175     stbir__simdf_madd( tot2, tot2, c, d );  }
5176 
// 3-channel, 4-wide SIMD: reduce tot0/tot1/tot2 to one pixel in tot0, store it, and step to the
// next output pixel.  Must be expanded inside a loop: it ends the iteration with `continue` or
// `break`.  The three accumulators hold four interleaved 3-channel pixels, so lanes are
// realigned (3ABx, 23Ax, 1230 shuffles) before the adds.
// While more pixels follow (output < output_end after the advance) the result is written with a
// single 4-float store at output-3; on the last pixel exactly three floats are written
// (2-float store plus 1-float store) and the loop is left with `break`.
// Clobbers c, cs, tot1 and tot2.
5177 #define stbir__store_output()                       \
5178     stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
5179     stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 );  \
5180     stbir__simdf_0123to1230( tot2, tot2 );          \
5181     stbir__simdf_add( tot0, tot0, cs );             \
5182     stbir__simdf_add( c, c, tot2 );                 \
5183     stbir__simdf_add( tot0, tot0, c );              \
5184     horizontal_coefficients += coefficient_width;   \
5185     ++horizontal_contributors;                      \
5186     output += 3;                                    \
5187     if ( output < output_end )                      \
5188     {                                               \
5189       stbir__simdf_store( output-3, tot0 );         \
5190       continue;                                     \
5191     }                                               \
5192     stbir__simdf_0123to2301( tot1, tot0 );          \
5193     stbir__simdf_store2( output-3, tot0 );          \
5194     stbir__simdf_store1( output+2-3, tot1 );        \
5195     break;
5196 
5197 #endif
5198 
5199 #else
5200 
// 3-channel scalar path, single coefficient: scale the first decoded pixel's
// three channels by hc[0] into tot0 / tot1 / tot2 (declared here, with c,
// in the enclosing scope).
#define stbir__1_coeff_only()      \
    float c = hc[0];               \
    float tot0 = decode[0] * c;    \
    float tot1 = decode[1] * c;    \
    float tot2 = decode[2] * c;
5207 
// 3-channel scalar path, two coefficients: tot0..tot2 receive the weighted
// sum of the first two decoded pixels (weights hc[0], hc[1]).
#define stbir__2_coeff_only()      \
    float c = hc[0];               \
    float tot0 = decode[0] * c;    \
    float tot1 = decode[1] * c;    \
    float tot2 = decode[2] * c;    \
    c = hc[1];                     \
    tot0 += decode[3] * c;         \
    tot1 += decode[4] * c;         \
    tot2 += decode[5] * c;
5218 
// 3-channel scalar path, three coefficients: tot0..tot2 receive the weighted
// sum of the first three decoded pixels, accumulated in the order
// hc[0], hc[1], hc[2].
#define stbir__3_coeff_only()      \
    float c = hc[0];               \
    float tot0 = decode[0] * c;    \
    float tot1 = decode[1] * c;    \
    float tot2 = decode[2] * c;    \
    c = hc[1];                     \
    tot0 += decode[3] * c;         \
    tot1 += decode[4] * c;         \
    tot2 += decode[5] * c;         \
    c = hc[2];                     \
    tot0 += decode[6] * c;         \
    tot1 += decode[7] * c;         \
    tot2 += decode[8] * c;
5233 
// 3-channel scalar path: write the finished pixel (tot0, tot1, tot2), then
// move output, the coefficient pointer and the contributor pointer on to
// the next output pixel.
#define stbir__store_output_tiny()                  \
    *output++ = tot0;                               \
    *output++ = tot1;                               \
    *output++ = tot2;                               \
    ++horizontal_contributors;                      \
    horizontal_coefficients += coefficient_width;
5241 
// 3-channel scalar path: start a dot product of four or more coefficients.
// Declares one accumulator triple per coefficient slot (tota*, totb*, totc*,
// totd*; the digit is the channel) and initialises each with its pixel of
// decode scaled by the matching coefficient.
#define stbir__4_coeff_start()                                                           \
    float tota0, tota1, tota2;                                                           \
    float totb0, totb1, totb2;                                                           \
    float totc0, totc1, totc2;                                                           \
    float totd0, totd1, totd2;                                                           \
    float c;                                                                             \
    c = hc[0];  tota0 = decode[0]*c;  tota1 = decode[1]*c;   tota2 = decode[2]*c;        \
    c = hc[1];  totb0 = decode[3]*c;  totb1 = decode[4]*c;   totb2 = decode[5]*c;        \
    c = hc[2];  totc0 = decode[6]*c;  totc1 = decode[7]*c;   totc2 = decode[8]*c;        \
    c = hc[3];  totd0 = decode[9]*c;  totd1 = decode[10]*c;  totd2 = decode[11]*c;
5260 
// 3-channel scalar path: accumulate four more coefficients, starting at
// coefficient index `ofs`, into the accumulators from stbir__4_coeff_start().
// Coefficient ofs+K feeds triple K (a, b, c, d); its pixel starts at decode[(ofs+K)*3].
#define stbir__4_coeff_continue_from_4( ofs )                                                                          \
    c = hc[(ofs)];    tota0 += decode[(ofs)*3]*c;    tota1 += decode[(ofs)*3+1]*c;   tota2 += decode[(ofs)*3+2]*c;     \
    c = hc[(ofs)+1];  totb0 += decode[(ofs)*3+3]*c;  totb1 += decode[(ofs)*3+4]*c;   totb2 += decode[(ofs)*3+5]*c;     \
    c = hc[(ofs)+2];  totc0 += decode[(ofs)*3+6]*c;  totc1 += decode[(ofs)*3+7]*c;   totc2 += decode[(ofs)*3+8]*c;     \
    c = hc[(ofs)+3];  totd0 += decode[(ofs)*3+9]*c;  totd1 += decode[(ofs)*3+10]*c;  totd2 += decode[(ofs)*3+11]*c;
5278 
// 3-channel scalar path: fold one trailing coefficient (index `ofs`) into
// accumulator triple a.
#define stbir__1_coeff_remnant( ofs )                                                                          \
    c = hc[(ofs)];  tota0 += decode[(ofs)*3]*c;  tota1 += decode[(ofs)*3+1]*c;  tota2 += decode[(ofs)*3+2]*c;
5284 
// 3-channel scalar path: fold two trailing coefficients (starting at index
// `ofs`) into accumulator triples a and b.
// Expects c and the tota*/totb* accumulators from stbir__4_coeff_start() to
// be in scope.  The last line must not end in a line-continuation backslash,
// otherwise the macro would absorb whatever line follows the definition.
#define stbir__2_coeff_remnant( ofs )  \
    c = hc[0+(ofs)];                   \
    tota0 += decode[0+(ofs)*3]*c;      \
    tota1 += decode[1+(ofs)*3]*c;      \
    tota2 += decode[2+(ofs)*3]*c;      \
    c = hc[1+(ofs)];                   \
    totb0 += decode[3+(ofs)*3]*c;      \
    totb1 += decode[4+(ofs)*3]*c;      \
    totb2 += decode[5+(ofs)*3]*c;
5294 
// 3-channel scalar path: fold three trailing coefficients (starting at index
// `ofs`) into accumulator triples a, b and c.
#define stbir__3_coeff_remnant( ofs )                                                                              \
    c = hc[(ofs)];    tota0 += decode[(ofs)*3]*c;    tota1 += decode[(ofs)*3+1]*c;  tota2 += decode[(ofs)*3+2]*c;  \
    c = hc[(ofs)+1];  totb0 += decode[(ofs)*3+3]*c;  totb1 += decode[(ofs)*3+4]*c;  totb2 += decode[(ofs)*3+5]*c;  \
    c = hc[(ofs)+2];  totc0 += decode[(ofs)*3+6]*c;  totc1 += decode[(ofs)*3+7]*c;  totc2 += decode[(ofs)*3+8]*c;
5308 
// 3-channel scalar path: combine the four accumulator triples into one
// pixel, write it, and move on to the next output pixel.  The pairing
// (a+c)+(b+d) is part of the numeric contract and must be kept as written.
#define stbir__store_output()                       \
    *output++ = (tota0+totc0)+(totb0+totd0);        \
    *output++ = (tota1+totc1)+(totb1+totd1);        \
    *output++ = (tota2+totc2)+(totb2+totd2);        \
    ++horizontal_contributors;                      \
    horizontal_coefficients += coefficient_width;
5316 
5317 #endif
5318 
// Instantiate the 3-channel horizontal routines: with the stbir__*_coeff_* macros above in
// effect, set the channel count, select the horizontal section, and include
// STBIR__HEADER_FILENAME (presumably this header itself -- confirm where it is defined).
5319 #define STBIR__horizontal_channels 3
5320 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
5321 #include STBIR__HEADER_FILENAME
5322 
5323 //=================
5324 // Do 4 channel horizontal routines
5325 
5326 #ifdef STBIR_SIMD
5327 
// 4-channel SIMD, single coefficient: tot = broadcast(hc[0]) * (floats at decode).
// Declares tot and c in the enclosing scope.  One 4-wide register holds exactly one
// 4-channel pixel, so a plain broadcast of the coefficient (0123to0000) is all that is needed.
5328 #define stbir__1_coeff_only()             \
5329     stbir__simdf tot,c;                   \
5330     STBIR_SIMD_NO_UNROLL(decode);         \
5331     stbir__simdf_load1( c, hc );          \
5332     stbir__simdf_0123to0000( c, c );      \
5333     stbir__simdf_mult_mem( tot, c, decode );
5334 
// 4-channel SIMD, two coefficients: each coefficient is broadcast (0000 / 1111) and
// multiplied with its pixel (decode, decode+4), summing into tot.
// Declares tot, c and cs in the enclosing scope.
5335 #define stbir__2_coeff_only()                       \
5336     stbir__simdf tot,c,cs;                          \
5337     STBIR_SIMD_NO_UNROLL(decode);                   \
5338     stbir__simdf_load2( cs, hc );                   \
5339     stbir__simdf_0123to0000( c, cs );               \
5340     stbir__simdf_mult_mem( tot, c, decode );        \
5341     stbir__simdf_0123to1111( c, cs );               \
5342     stbir__simdf_madd_mem( tot, tot, c, decode+4 );
5343 
// 4-channel SIMD, three coefficients: each coefficient is broadcast (0000 / 1111 / 2222) and
// multiplied with its pixel (decode, decode+4, decode+8), summing into tot.
// NOTE(review): hc is read with a full stbir__simdf_load although only three coefficients are
// used -- presumably a 4-float read; confirm the coefficient buffer is padded for it.
5344 #define stbir__3_coeff_only()                       \
5345     stbir__simdf tot,c,cs;                          \
5346     STBIR_SIMD_NO_UNROLL(decode);                   \
5347     stbir__simdf_load( cs, hc );                    \
5348     stbir__simdf_0123to0000( c, cs );               \
5349     stbir__simdf_mult_mem( tot, c, decode );        \
5350     stbir__simdf_0123to1111( c, cs );               \
5351     stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
5352     stbir__simdf_0123to2222( c, cs );               \
5353     stbir__simdf_madd_mem( tot, tot, c, decode+8 );
5354 
// 4-channel SIMD: store the pixel held in tot to output, then step the coefficient pointer
// (by coefficient_width), the contributor pointer (by one) and output (by four floats)
// to the next output pixel.
5355 #define stbir__store_output_tiny()                \
5356     stbir__simdf_store( output, tot );            \
5357     horizontal_coefficients += coefficient_width; \
5358     ++horizontal_contributors;                    \
5359     output += 4;
5360 
5361 #ifdef STBIR_SIMD8
5362 
// 4-channel, 8-wide SIMD: start a dot product of four or more coefficients.
// Declares the 8-wide accumulator tot0 with scratch c / cs, plus a 4-wide scratch t, in the
// enclosing scope.  Coefficients 0,1 (00001111) are multiplied with the floats at decode and
// coefficients 2,3 (22223333) multiply-added with those at decode+8, all into tot0.
5363 #define stbir__4_coeff_start()                     \
5364     stbir__simdf8 tot0,c,cs; stbir__simdf t;  \
5365     STBIR_SIMD_NO_UNROLL(decode);                  \
5366     stbir__simdf8_load4b( cs, hc );                \
5367     stbir__simdf8_0123to00001111( c, cs );         \
5368     stbir__simdf8_mult_mem( tot0, c, decode );     \
5369     stbir__simdf8_0123to22223333( c, cs );         \
5370     stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
5371 
// 4-channel, 8-wide SIMD: accumulate four more coefficients starting at coefficient index `ofs`.
// Coefficients ofs, ofs+1 go with the floats at decode+ofs*4 and coefficients ofs+2, ofs+3
// with those at decode+ofs*4+8; both multiply-adds accumulate into tot0.
5372 #define stbir__4_coeff_continue_from_4( ofs )                  \
5373     STBIR_SIMD_NO_UNROLL(decode);                              \
5374     stbir__simdf8_load4b( cs, hc + (ofs) );                    \
5375     stbir__simdf8_0123to00001111( c, cs );                     \
5376     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
5377     stbir__simdf8_0123to22223333( c, cs );                     \
5378     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5379 
// 4-channel, 8-wide SIMD: fold one trailing coefficient (index `ofs`) into tot0.
// The coefficient is replicated into the 4-wide scratch t (load1rep4) and multiply-added
// with the pixel at decode+ofs*4 via the 4-wide madd_mem4 variant.
5380 #define stbir__1_coeff_remnant( ofs )                          \
5381     STBIR_SIMD_NO_UNROLL(decode);                              \
5382     stbir__simdf_load1rep4( t, hc + (ofs) );                   \
5383     stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
5384 
// 4-channel, 8-wide SIMD: fold two trailing coefficients (starting at index `ofs`) into tot0.
// The coefficient load starts two entries early (hc+ofs-2) and then selects entries 2 and 3
// of that load (22223333), i.e. hc[ofs] and hc[ofs+1]; this reads coefficients that were
// already consumed rather than reading beyond the two that remain.
5385 #define stbir__2_coeff_remnant( ofs )                          \
5386     STBIR_SIMD_NO_UNROLL(decode);                              \
5387     stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
5388     stbir__simdf8_0123to22223333( c, cs );                     \
5389     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5390 
// 4-channel, 8-wide SIMD: fold three trailing coefficients (starting at index `ofs`) into tot0.
// The first two are handled like a full group; the third is extracted into the 4-wide
// scratch t (0123to2222) and multiply-added with the pixel at decode+ofs*4+8 via madd_mem4.
5391  #define stbir__3_coeff_remnant( ofs )                         \
5392     STBIR_SIMD_NO_UNROLL(decode);                              \
5393     stbir__simdf8_load4b( cs, hc + (ofs) );                    \
5394     stbir__simdf8_0123to00001111( c, cs );                     \
5395     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
5396     stbir__simdf8_0123to2222( t, cs );                         \
5397     stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
5398 
// 4-channel, 8-wide SIMD: reduce tot0 to one 4-float pixel in t, store it to output, and step
// the coefficient, contributor and output pointers to the next output pixel.
// NOTE(review): add4halves presumably sums the low and high 4-lane halves of tot0
// (inferred from the helper name -- confirm).
5399 #define stbir__store_output()                      \
5400     stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     \
5401     stbir__simdf_store( output, t );               \
5402     horizontal_coefficients += coefficient_width;  \
5403     ++horizontal_contributors;                     \
5404     output += 4;
5405 
5406 #else
5407 
// 4-channel, 4-wide SIMD: start a dot product of four or more coefficients.
// Declares two accumulators (tot0, tot1) and scratch c / cs in the enclosing scope.
// Even coefficients (0, 2) accumulate into tot0 and odd coefficients (1, 3) into tot1,
// each broadcast and multiplied with its own pixel at decode + 4*index.
5408 #define stbir__4_coeff_start()                        \
5409     stbir__simdf tot0,tot1,c,cs;                      \
5410     STBIR_SIMD_NO_UNROLL(decode);                     \
5411     stbir__simdf_load( cs, hc );                      \
5412     stbir__simdf_0123to0000( c, cs );                 \
5413     stbir__simdf_mult_mem( tot0, c, decode );         \
5414     stbir__simdf_0123to1111( c, cs );                 \
5415     stbir__simdf_mult_mem( tot1, c, decode+4 );       \
5416     stbir__simdf_0123to2222( c, cs );                 \
5417     stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
5418     stbir__simdf_0123to3333( c, cs );                 \
5419     stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
5420 
// 4-channel, 4-wide SIMD: accumulate four more coefficients starting at coefficient index `ofs`.
// As in stbir__4_coeff_start(), even coefficients accumulate into tot0 and odd ones into tot1;
// pixel K of the group is read at decode+ofs*4+4*K.
5421 #define stbir__4_coeff_continue_from_4( ofs )                  \
5422     STBIR_SIMD_NO_UNROLL(decode);                              \
5423     stbir__simdf_load( cs, hc + (ofs) );                       \
5424     stbir__simdf_0123to0000( c, cs );                          \
5425     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
5426     stbir__simdf_0123to1111( c, cs );                          \
5427     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
5428     stbir__simdf_0123to2222( c, cs );                          \
5429     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
5430     stbir__simdf_0123to3333( c, cs );                          \
5431     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
5432 
// 4-channel, 4-wide SIMD: fold one trailing coefficient (index `ofs`) into tot0.
// The coefficient at hc+ofs is broadcast and multiply-added with the pixel at decode+ofs*4.
5433 #define stbir__1_coeff_remnant( ofs )                       \
5434     STBIR_SIMD_NO_UNROLL(decode);                           \
5435     stbir__simdf_load1( c, hc + (ofs) );                    \
5436     stbir__simdf_0123to0000( c, c );                        \
5437     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5438 
// 4-channel, 4-wide SIMD: fold two trailing coefficients (starting at index `ofs`).
// The first is broadcast and multiply-added with decode+ofs*4 into tot0, the second with
// decode+ofs*4+4 into tot1.
5439 #define stbir__2_coeff_remnant( ofs )                         \
5440     STBIR_SIMD_NO_UNROLL(decode);                             \
5441     stbir__simdf_load2( cs, hc + (ofs) );                     \
5442     stbir__simdf_0123to0000( c, cs );                         \
5443     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
5444     stbir__simdf_0123to1111( c, cs );                         \
5445     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
5446 
// 4-channel, 4-wide SIMD: fold three trailing coefficients (starting at index `ofs`).
// Coefficients ofs and ofs+2 accumulate into tot0, coefficient ofs+1 into tot1, each with
// its own pixel at decode+ofs*4, +4 and +8.
// NOTE(review): hc+ofs is read with a full stbir__simdf_load although only three coefficients
// are used -- presumably a 4-float read; confirm the coefficient buffer is padded for it.
5447 #define stbir__3_coeff_remnant( ofs )                          \
5448     STBIR_SIMD_NO_UNROLL(decode);                              \
5449     stbir__simdf_load( cs, hc + (ofs) );                       \
5450     stbir__simdf_0123to0000( c, cs );                          \
5451     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
5452     stbir__simdf_0123to1111( c, cs );                          \
5453     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
5454     stbir__simdf_0123to2222( c, cs );                          \
5455     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5456 
// 4-channel, 4-wide SIMD: add the two accumulators (tot0 += tot1), store the resulting pixel
// to output, and step the coefficient, contributor and output pointers to the next output pixel.
5457 #define stbir__store_output()                     \
5458     stbir__simdf_add( tot0, tot0, tot1 );         \
5459     stbir__simdf_store( output, tot0 );           \
5460     horizontal_coefficients += coefficient_width; \
5461     ++horizontal_contributors;                    \
5462     output += 4;
5463 
5464 #endif
5465 
5466 #else
5467 
// 4-channel scalar path, single coefficient: scale the first decoded pixel's
// four channels by hc[0] into p0..p3 (declared here, with c, in the
// enclosing scope).
#define stbir__1_coeff_only()                      \
    float c;                                       \
    float p0, p1, p2, p3;                          \
    STBIR_SIMD_NO_UNROLL(decode);                  \
    c = hc[0];                                     \
    p0 = decode[0] * c;  p1 = decode[1] * c;       \
    p2 = decode[2] * c;  p3 = decode[3] * c;
5476 
// 4-channel scalar path, two coefficients: p0..p3 receive the weighted sum
// of the first two decoded pixels (weights hc[0], hc[1]).
#define stbir__2_coeff_only()                      \
    float c;                                       \
    float p0, p1, p2, p3;                          \
    STBIR_SIMD_NO_UNROLL(decode);                  \
    c = hc[0];                                     \
    p0 = decode[0] * c;   p1 = decode[1] * c;      \
    p2 = decode[2] * c;   p3 = decode[3] * c;      \
    c = hc[1];                                     \
    p0 += decode[4] * c;  p1 += decode[5] * c;     \
    p2 += decode[6] * c;  p3 += decode[7] * c;
5490 
// 4-channel scalar path, three coefficients: p0..p3 receive the weighted sum
// of the first three decoded pixels, accumulated in the order
// hc[0], hc[1], hc[2].
#define stbir__3_coeff_only()                      \
    float c;                                       \
    float p0, p1, p2, p3;                          \
    STBIR_SIMD_NO_UNROLL(decode);                  \
    c = hc[0];                                     \
    p0 = decode[0] * c;    p1 = decode[1] * c;     \
    p2 = decode[2] * c;    p3 = decode[3] * c;     \
    c = hc[1];                                     \
    p0 += decode[4] * c;   p1 += decode[5] * c;    \
    p2 += decode[6] * c;   p3 += decode[7] * c;    \
    c = hc[2];                                     \
    p0 += decode[8] * c;   p1 += decode[9] * c;    \
    p2 += decode[10] * c;  p3 += decode[11] * c;
5509 
// 4-channel scalar path: write the finished pixel (p0..p3), then move
// output, the coefficient pointer and the contributor pointer on to the
// next output pixel.
#define stbir__store_output_tiny()                  \
    *output++ = p0;                                 \
    *output++ = p1;                                 \
    *output++ = p2;                                 \
    *output++ = p3;                                 \
    ++horizontal_contributors;                      \
    horizontal_coefficients += coefficient_width;
5518 
// 4-channel scalar path: start a dot product of four or more coefficients.
// Declares two accumulator sets in the enclosing scope: x0..x3 collect the
// even coefficients (hc[0], hc[2]) and y0..y3 the odd ones (hc[1], hc[3]),
// each applied to its own pixel of decode.
#define stbir__4_coeff_start()                        \
    float c;                                          \
    float x0, x1, x2, x3;                             \
    float y0, y1, y2, y3;                             \
    STBIR_SIMD_NO_UNROLL(decode);                     \
    c = hc[0];                                        \
    x0 = decode[0] * c;    x1 = decode[1] * c;        \
    x2 = decode[2] * c;    x3 = decode[3] * c;        \
    c = hc[1];                                        \
    y0 = decode[4] * c;    y1 = decode[5] * c;        \
    y2 = decode[6] * c;    y3 = decode[7] * c;        \
    c = hc[2];                                        \
    x0 += decode[8] * c;   x1 += decode[9] * c;       \
    x2 += decode[10] * c;  x3 += decode[11] * c;      \
    c = hc[3];                                        \
    y0 += decode[12] * c;  y1 += decode[13] * c;      \
    y2 += decode[14] * c;  y3 += decode[15] * c;
5542 
// Adds four more coefficients, hc[ofs] .. hc[ofs+3], to the x/y accumulators
// declared by stbir__4_coeff_start(). Input is read from decode starting at
// float index (ofs)*4, i.e. 4 floats per coefficient.
5543 #define stbir__4_coeff_continue_from_4( ofs ) \
5544     STBIR_SIMD_NO_UNROLL(decode);     \
5545     c = hc[0+(ofs)];                  \
5546     x0 += decode[0+(ofs)*4] * c;      \
5547     x1 += decode[1+(ofs)*4] * c;      \
5548     x2 += decode[2+(ofs)*4] * c;      \
5549     x3 += decode[3+(ofs)*4] * c;      \
5550     c = hc[1+(ofs)];                  \
5551     y0 += decode[4+(ofs)*4] * c;      \
5552     y1 += decode[5+(ofs)*4] * c;      \
5553     y2 += decode[6+(ofs)*4] * c;      \
5554     y3 += decode[7+(ofs)*4] * c;      \
5555     c = hc[2+(ofs)];                  \
5556     x0 += decode[8+(ofs)*4] * c;      \
5557     x1 += decode[9+(ofs)*4] * c;      \
5558     x2 += decode[10+(ofs)*4] * c;     \
5559     x3 += decode[11+(ofs)*4] * c;     \
5560     c = hc[3+(ofs)];                  \
5561     y0 += decode[12+(ofs)*4] * c;     \
5562     y1 += decode[13+(ofs)*4] * c;     \
5563     y2 += decode[14+(ofs)*4] * c;     \
5564     y3 += decode[15+(ofs)*4] * c;
5565 
// Tail step for one leftover coefficient: adds hc[ofs] times the 4 floats at
// decode[(ofs)*4 ..] into the x accumulators.
5566 #define stbir__1_coeff_remnant( ofs ) \
5567     STBIR_SIMD_NO_UNROLL(decode);     \
5568     c = hc[0+(ofs)];                  \
5569     x0 += decode[0+(ofs)*4] * c;      \
5570     x1 += decode[1+(ofs)*4] * c;      \
5571     x2 += decode[2+(ofs)*4] * c;      \
5572     x3 += decode[3+(ofs)*4] * c;
5573 
// Tail step for two leftover coefficients: hc[ofs] goes into the x accumulators
// and hc[ofs+1] into the y accumulators.
5574 #define stbir__2_coeff_remnant( ofs ) \
5575     STBIR_SIMD_NO_UNROLL(decode);     \
5576     c = hc[0+(ofs)];                  \
5577     x0 += decode[0+(ofs)*4] * c;      \
5578     x1 += decode[1+(ofs)*4] * c;      \
5579     x2 += decode[2+(ofs)*4] * c;      \
5580     x3 += decode[3+(ofs)*4] * c;      \
5581     c = hc[1+(ofs)];                  \
5582     y0 += decode[4+(ofs)*4] * c;      \
5583     y1 += decode[5+(ofs)*4] * c;      \
5584     y2 += decode[6+(ofs)*4] * c;      \
5585     y3 += decode[7+(ofs)*4] * c;
5586 
// Tail step for three leftover coefficients: hc[ofs] and hc[ofs+2] go into the
// x accumulators, hc[ofs+1] into the y accumulators.
5587 #define stbir__3_coeff_remnant( ofs ) \
5588     STBIR_SIMD_NO_UNROLL(decode);     \
5589     c = hc[0+(ofs)];                  \
5590     x0 += decode[0+(ofs)*4] * c;      \
5591     x1 += decode[1+(ofs)*4] * c;      \
5592     x2 += decode[2+(ofs)*4] * c;      \
5593     x3 += decode[3+(ofs)*4] * c;      \
5594     c = hc[1+(ofs)];                  \
5595     y0 += decode[4+(ofs)*4] * c;      \
5596     y1 += decode[5+(ofs)*4] * c;      \
5597     y2 += decode[6+(ofs)*4] * c;      \
5598     y3 += decode[7+(ofs)*4] * c;      \
5599     c = hc[2+(ofs)];                  \
5600     x0 += decode[8+(ofs)*4] * c;      \
5601     x1 += decode[9+(ofs)*4] * c;      \
5602     x2 += decode[10+(ofs)*4] * c;     \
5603     x3 += decode[11+(ofs)*4] * c;
5604 
// Combines the two accumulator sets (x + y per channel) into output[0..3], then
// steps to the next output pixel: horizontal_coefficients advances by
// coefficient_width, horizontal_contributors by one entry, output by 4 floats.
5605 #define stbir__store_output()                     \
5606     output[0] = x0 + y0;                          \
5607     output[1] = x1 + y1;                          \
5608     output[2] = x2 + y2;                          \
5609     output[3] = x3 + y3;                          \
5610     horizontal_coefficients += coefficient_width; \
5611     ++horizontal_contributors;                    \
5612     output += 4;
5613 
5614 #endif
5615 
// Re-include this header in "do horizontals" mode with the channel count set to 4.
// Presumably that pass expands the coefficient macros defined above into the
// 4-channel horizontal routines -- confirm against the STB_IMAGE_RESIZE_DO_HORIZONTALS section.
5616 #define STBIR__horizontal_channels 4
5617 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
5618 #include STBIR__HEADER_FILENAME
5619 
5620 
5621 
5622 //=================
5623 // Do 7 channel horizontal routines
5624 
5625 #ifdef STBIR_SIMD
5626 
// SIMD 7-channel kernel for a single coefficient.
// hc[0] is loaded and splatted into c; tot0 is built from decode+0 and tot1 from
// decode+3. Assuming stbir__simdf holds 4 floats (TODO confirm), the two vectors
// cover channels 0..3 and 3..6 and overlap at channel 3.
5627 #define stbir__1_coeff_only()                   \
5628     stbir__simdf tot0,tot1,c;                   \
5629     STBIR_SIMD_NO_UNROLL(decode);               \
5630     stbir__simdf_load1( c, hc );                \
5631     stbir__simdf_0123to0000( c, c );            \
5632     stbir__simdf_mult_mem( tot0, c, decode );   \
5633     stbir__simdf_mult_mem( tot1, c, decode+3 );
5634 
// SIMD 7-channel kernel for exactly two coefficients.
// cs holds the coefficients loaded from hc; each one is splatted into c in turn.
// The first pixel is read at decode+0 / decode+3 and the second at decode+7 /
// decode+10, so pixels are 7 floats apart and each is covered by two vectors
// starting at channels 0 and 3.
5635 #define stbir__2_coeff_only()                         \
5636     stbir__simdf tot0,tot1,c,cs;                      \
5637     STBIR_SIMD_NO_UNROLL(decode);                     \
5638     stbir__simdf_load2( cs, hc );                     \
5639     stbir__simdf_0123to0000( c, cs );                 \
5640     stbir__simdf_mult_mem( tot0, c, decode );         \
5641     stbir__simdf_mult_mem( tot1, c, decode+3 );       \
5642     stbir__simdf_0123to1111( c, cs );                 \
5643     stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
5644     stbir__simdf_madd_mem( tot1, tot1, c,decode+10 );
5645 
// SIMD 7-channel kernel for exactly three coefficients.
// Uses stbir__simdf_load on hc rather than a 3-element load -- presumably a full
// vector load whose last lane is simply unused here (TODO confirm hc is readable
// that far). Pixels are read at decode+0/+3, +7/+10 and +14/+17.
5646 #define stbir__3_coeff_only()                           \
5647     stbir__simdf tot0,tot1,c,cs;                        \
5648     STBIR_SIMD_NO_UNROLL(decode);                       \
5649     stbir__simdf_load( cs, hc );                        \
5650     stbir__simdf_0123to0000( c, cs );                   \
5651     stbir__simdf_mult_mem( tot0, c, decode );           \
5652     stbir__simdf_mult_mem( tot1, c, decode+3 );         \
5653     stbir__simdf_0123to1111( c, cs );                   \
5654     stbir__simdf_madd_mem( tot0, tot0, c, decode+7 );   \
5655     stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
5656     stbir__simdf_0123to2222( c, cs );                   \
5657     stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
5658     stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
5659 
// Stores the two accumulators of the *_coeff_only kernels: tot1 at output+3
// first, then tot0 at output+0. Afterwards steps to the next output pixel
// (coefficients by coefficient_width, contributors by one, output by 7 floats).
5660 #define stbir__store_output_tiny()                \
5661     stbir__simdf_store( output+3, tot1 );         \
5662     stbir__simdf_store( output, tot0 );           \
5663     horizontal_coefficients += coefficient_width; \
5664     ++horizontal_contributors;                    \
5665     output += 7;
5666 
5667 #ifdef STBIR_SIMD8
5668 
// SIMD8 7-channel kernel start: handles the first four coefficients.
// tot0 accumulates coefficients 0 and 2 (pixels at decode+0 and decode+14),
// tot1 accumulates coefficients 1 and 3 (pixels at decode+7 and decode+21).
// Presumably each 8-wide access covers one 7-float pixel plus one extra float
// -- confirm against the stbir__simdf8 definitions.
5669 #define stbir__4_coeff_start()                     \
5670     stbir__simdf8 tot0,tot1,c,cs;                  \
5671     STBIR_SIMD_NO_UNROLL(decode);                  \
5672     stbir__simdf8_load4b( cs, hc );                \
5673     stbir__simdf8_0123to00000000( c, cs );         \
5674     stbir__simdf8_mult_mem( tot0, c, decode );     \
5675     stbir__simdf8_0123to11111111( c, cs );         \
5676     stbir__simdf8_mult_mem( tot1, c, decode+7 );   \
5677     stbir__simdf8_0123to22222222( c, cs );         \
5678     stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
5679     stbir__simdf8_0123to33333333( c, cs );         \
5680     stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
5681 
// SIMD8: adds four more coefficients starting at hc+(ofs), reading pixels from
// decode+(ofs)*7 onward (7 floats per pixel). Even coefficients go to tot0 and
// odd ones to tot1, as in stbir__4_coeff_start().
5682 #define stbir__4_coeff_continue_from_4( ofs )                   \
5683     STBIR_SIMD_NO_UNROLL(decode);                               \
5684     stbir__simdf8_load4b( cs, hc + (ofs) );                     \
5685     stbir__simdf8_0123to00000000( c, cs );                      \
5686     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
5687     stbir__simdf8_0123to11111111( c, cs );                      \
5688     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
5689     stbir__simdf8_0123to22222222( c, cs );                      \
5690     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
5691     stbir__simdf8_0123to33333333( c, cs );                      \
5692     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
5693 
// SIMD8 tail step for one leftover coefficient: hc[ofs] (presumably broadcast
// by stbir__simdf8_load1b) times the pixel at decode+(ofs)*7, added into tot0.
5694 #define stbir__1_coeff_remnant( ofs )                           \
5695     STBIR_SIMD_NO_UNROLL(decode);                               \
5696     stbir__simdf8_load1b( c, hc + (ofs) );                      \
5697     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
5698 
// SIMD8 tail step for two leftover coefficients: hc[ofs] goes into tot0 and
// hc[ofs+1] into tot1, each loaded separately with stbir__simdf8_load1b.
5699 #define stbir__2_coeff_remnant( ofs )                           \
5700     STBIR_SIMD_NO_UNROLL(decode);                               \
5701     stbir__simdf8_load1b( c, hc + (ofs) );                      \
5702     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
5703     stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
5704     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );
5705 
// SIMD8 tail step for three leftover coefficients. Uses the 4-coefficient load
// (stbir__simdf8_load4b) and only the first three lanes: coefficients 0 and 2
// go into tot0, coefficient 1 into tot1.
5706 #define stbir__3_coeff_remnant( ofs )                           \
5707     STBIR_SIMD_NO_UNROLL(decode);                               \
5708     stbir__simdf8_load4b( cs, hc + (ofs) );                     \
5709     stbir__simdf8_0123to00000000( c, cs );                      \
5710     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
5711     stbir__simdf8_0123to11111111( c, cs );                      \
5712     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
5713     stbir__simdf8_0123to22222222( c, cs );                      \
5714     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
5715 
// SIMD8 store: folds tot1 into tot0, steps to the next output pixel, then writes
// the pixel that was just finished (at output-7).
// While the advanced output pointer is still below output_end, the pixel is
// written with one stbir__simdf8_store and the enclosing loop is `continue`d.
// Otherwise it is written with two stbir__simdf_store calls (output-7+3 from the
// swizzled top half, output-7 from the low half) and the loop is left via `break`.
// NOTE(review): presumably the final pixel avoids the 8-wide store so that
// nothing is written beyond its 7 floats -- confirm.
// Because of continue/break, this macro must expand directly inside a loop body.
5716 #define stbir__store_output()                     \
5717     stbir__simdf8_add( tot0, tot0, tot1 );        \
5718     horizontal_coefficients += coefficient_width; \
5719     ++horizontal_contributors;                    \
5720     output += 7;                                  \
5721     if ( output < output_end )                    \
5722     {                                             \
5723       stbir__simdf8_store( output-7, tot0 );      \
5724       continue;                                   \
5725     }                                             \
5726     stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
5727     stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) );           \
5728     break;
5729 
5730 #else
5731 
// 4-wide SIMD 7-channel kernel start: handles the first four coefficients.
// Each pixel is covered by two vectors starting at channels 0 and 3.
// tot0/tot1 accumulate coefficients 0 and 2 (pixels at decode+0 and decode+14);
// tot2/tot3 accumulate coefficients 1 and 3 (pixels at decode+7 and decode+21).
// The pairs are added together in stbir__store_output().
5732 #define stbir__4_coeff_start()                    \
5733     stbir__simdf tot0,tot1,tot2,tot3,c,cs;        \
5734     STBIR_SIMD_NO_UNROLL(decode);                 \
5735     stbir__simdf_load( cs, hc );                  \
5736     stbir__simdf_0123to0000( c, cs );             \
5737     stbir__simdf_mult_mem( tot0, c, decode );     \
5738     stbir__simdf_mult_mem( tot1, c, decode+3 );   \
5739     stbir__simdf_0123to1111( c, cs );             \
5740     stbir__simdf_mult_mem( tot2, c, decode+7 );   \
5741     stbir__simdf_mult_mem( tot3, c, decode+10 );  \
5742     stbir__simdf_0123to2222( c, cs );             \
5743     stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
5744     stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
5745     stbir__simdf_0123to3333( c, cs );                   \
5746     stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
5747     stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
5748 
// 4-wide SIMD: adds four more coefficients starting at hc+(ofs), reading pixels
// from decode+(ofs)*7 onward. Even coefficients go to tot0/tot1 and odd ones to
// tot2/tot3, as in stbir__4_coeff_start().
5749 #define stbir__4_coeff_continue_from_4( ofs )                   \
5750     STBIR_SIMD_NO_UNROLL(decode);                               \
5751     stbir__simdf_load( cs, hc + (ofs) );                        \
5752     stbir__simdf_0123to0000( c, cs );                           \
5753     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
5754     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
5755     stbir__simdf_0123to1111( c, cs );                           \
5756     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
5757     stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
5758     stbir__simdf_0123to2222( c, cs );                           \
5759     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
5760     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
5761     stbir__simdf_0123to3333( c, cs );                           \
5762     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
5763     stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
5764 
// 4-wide SIMD tail step for one leftover coefficient: hc[ofs] is loaded and
// splatted, then applied to the pixel at decode+(ofs)*7 via tot0/tot1.
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5765 #define stbir__1_coeff_remnant( ofs )                           \
5766     STBIR_SIMD_NO_UNROLL(decode);                               \
5767     stbir__simdf_load1( c, hc + (ofs) );                        \
5768     stbir__simdf_0123to0000( c, c );                            \
5769     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
5770     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
5771 
// 4-wide SIMD tail step for two leftover coefficients: the first goes into
// tot0/tot1 (pixel at decode+(ofs)*7), the second into tot2/tot3 (next pixel).
5772 #define stbir__2_coeff_remnant( ofs )                           \
5773     STBIR_SIMD_NO_UNROLL(decode);                               \
5774     stbir__simdf_load2( cs, hc + (ofs) );                       \
5775     stbir__simdf_0123to0000( c, cs );                           \
5776     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
5777     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
5778     stbir__simdf_0123to1111( c, cs );                           \
5779     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
5780     stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
5781 
// 4-wide SIMD tail step for three leftover coefficients. Loads with
// stbir__simdf_load and uses only the first three lanes: coefficients 0 and 2
// go into tot0/tot1, coefficient 1 into tot2/tot3.
5782 #define stbir__3_coeff_remnant( ofs )                           \
5783     STBIR_SIMD_NO_UNROLL(decode);                               \
5784     stbir__simdf_load( cs, hc + (ofs) );                        \
5785     stbir__simdf_0123to0000( c, cs );                           \
5786     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
5787     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
5788     stbir__simdf_0123to1111( c, cs );                           \
5789     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
5790     stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
5791     stbir__simdf_0123to2222( c, cs );                           \
5792     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
5793     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
5794 
// 4-wide SIMD store: folds the odd-coefficient accumulators into the even ones
// (tot0 += tot2, tot1 += tot3), stores tot1 at output+3 and then tot0 at
// output+0, and steps to the next output pixel (output advances by 7 floats).
5795 #define stbir__store_output()                     \
5796     stbir__simdf_add( tot0, tot0, tot2 );         \
5797     stbir__simdf_add( tot1, tot1, tot3 );         \
5798     stbir__simdf_store( output+3, tot1 );         \
5799     stbir__simdf_store( output, tot0 );           \
5800     horizontal_coefficients += coefficient_width; \
5801     ++horizontal_contributors;                    \
5802     output += 7;
5803 
5804 #endif
5805 
5806 #else
5807 
// Scalar 7-channel kernel for a single coefficient: declares tot0..tot6 (one
// accumulator per channel) and sets each to decode[channel] * hc[0].
5808 #define stbir__1_coeff_only()        \
5809     float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5810     c = hc[0];                       \
5811     tot0 = decode[0]*c;              \
5812     tot1 = decode[1]*c;              \
5813     tot2 = decode[2]*c;              \
5814     tot3 = decode[3]*c;              \
5815     tot4 = decode[4]*c;              \
5816     tot5 = decode[5]*c;              \
5817     tot6 = decode[6]*c;
5818 
// Scalar 7-channel kernel for exactly two coefficients: tot0..tot6 sum hc[0]
// times decode[0..6] and hc[1] times decode[7..13].
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5819 #define stbir__2_coeff_only()        \
5820     float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5821     c = hc[0];                       \
5822     tot0 = decode[0]*c;              \
5823     tot1 = decode[1]*c;              \
5824     tot2 = decode[2]*c;              \
5825     tot3 = decode[3]*c;              \
5826     tot4 = decode[4]*c;              \
5827     tot5 = decode[5]*c;              \
5828     tot6 = decode[6]*c;              \
5829     c = hc[1];                       \
5830     tot0 += decode[7]*c;             \
5831     tot1 += decode[8]*c;             \
5832     tot2 += decode[9]*c;             \
5833     tot3 += decode[10]*c;            \
5834     tot4 += decode[11]*c;            \
5835     tot5 += decode[12]*c;            \
5836     tot6 += decode[13]*c;            \
5837 
// Scalar 7-channel kernel for exactly three coefficients: tot0..tot6 sum hc[0],
// hc[1], hc[2] times decode[0..6], decode[7..13], decode[14..20] respectively.
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5838 #define stbir__3_coeff_only()        \
5839     float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5840     c = hc[0];                       \
5841     tot0 = decode[0]*c;              \
5842     tot1 = decode[1]*c;              \
5843     tot2 = decode[2]*c;              \
5844     tot3 = decode[3]*c;              \
5845     tot4 = decode[4]*c;              \
5846     tot5 = decode[5]*c;              \
5847     tot6 = decode[6]*c;              \
5848     c = hc[1];                       \
5849     tot0 += decode[7]*c;             \
5850     tot1 += decode[8]*c;             \
5851     tot2 += decode[9]*c;             \
5852     tot3 += decode[10]*c;            \
5853     tot4 += decode[11]*c;            \
5854     tot5 += decode[12]*c;            \
5855     tot6 += decode[13]*c;            \
5856     c = hc[2];                       \
5857     tot0 += decode[14]*c;            \
5858     tot1 += decode[15]*c;            \
5859     tot2 += decode[16]*c;            \
5860     tot3 += decode[17]*c;            \
5861     tot4 += decode[18]*c;            \
5862     tot5 += decode[19]*c;            \
5863     tot6 += decode[20]*c;            \
5864 
// Writes tot0..tot6 to output[0..6], then steps to the next output pixel:
// horizontal_coefficients advances by coefficient_width, horizontal_contributors
// by one entry, and output by 7 floats.
5865 #define stbir__store_output_tiny()                \
5866     output[0] = tot0;                             \
5867     output[1] = tot1;                             \
5868     output[2] = tot2;                             \
5869     output[3] = tot3;                             \
5870     output[4] = tot4;                             \
5871     output[5] = tot5;                             \
5872     output[6] = tot6;                             \
5873     horizontal_coefficients += coefficient_width; \
5874     ++horizontal_contributors;                    \
5875     output += 7;
5876 
// Scalar 7-channel kernel start: handles the first four coefficients.
// Declares two accumulator sets per channel: x0..x6 take the even coefficients
// (hc[0], hc[2]) and y0..y6 the odd ones (hc[1], hc[3]). Pixels are 7 floats
// apart in decode. The two sets are added together in stbir__store_output().
5877 #define stbir__4_coeff_start()    \
5878     float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
5879     STBIR_SIMD_NO_UNROLL(decode); \
5880     c = hc[0];                    \
5881     x0 = decode[0] * c;           \
5882     x1 = decode[1] * c;           \
5883     x2 = decode[2] * c;           \
5884     x3 = decode[3] * c;           \
5885     x4 = decode[4] * c;           \
5886     x5 = decode[5] * c;           \
5887     x6 = decode[6] * c;           \
5888     c = hc[1];                    \
5889     y0 = decode[7] * c;           \
5890     y1 = decode[8] * c;           \
5891     y2 = decode[9] * c;           \
5892     y3 = decode[10] * c;          \
5893     y4 = decode[11] * c;          \
5894     y5 = decode[12] * c;          \
5895     y6 = decode[13] * c;          \
5896     c = hc[2];                    \
5897     x0 += decode[14] * c;         \
5898     x1 += decode[15] * c;         \
5899     x2 += decode[16] * c;         \
5900     x3 += decode[17] * c;         \
5901     x4 += decode[18] * c;         \
5902     x5 += decode[19] * c;         \
5903     x6 += decode[20] * c;         \
5904     c = hc[3];                    \
5905     y0 += decode[21] * c;         \
5906     y1 += decode[22] * c;         \
5907     y2 += decode[23] * c;         \
5908     y3 += decode[24] * c;         \
5909     y4 += decode[25] * c;         \
5910     y5 += decode[26] * c;         \
5911     y6 += decode[27] * c;
5912 
// Adds four more coefficients, hc[ofs] .. hc[ofs+3], to the x/y accumulators
// declared by stbir__4_coeff_start(). Input is read from decode starting at
// float index (ofs)*7, i.e. 7 floats per coefficient.
5913 #define stbir__4_coeff_continue_from_4( ofs ) \
5914     STBIR_SIMD_NO_UNROLL(decode);  \
5915     c = hc[0+(ofs)];               \
5916     x0 += decode[0+(ofs)*7] * c;   \
5917     x1 += decode[1+(ofs)*7] * c;   \
5918     x2 += decode[2+(ofs)*7] * c;   \
5919     x3 += decode[3+(ofs)*7] * c;   \
5920     x4 += decode[4+(ofs)*7] * c;   \
5921     x5 += decode[5+(ofs)*7] * c;   \
5922     x6 += decode[6+(ofs)*7] * c;   \
5923     c = hc[1+(ofs)];               \
5924     y0 += decode[7+(ofs)*7] * c;   \
5925     y1 += decode[8+(ofs)*7] * c;   \
5926     y2 += decode[9+(ofs)*7] * c;   \
5927     y3 += decode[10+(ofs)*7] * c;  \
5928     y4 += decode[11+(ofs)*7] * c;  \
5929     y5 += decode[12+(ofs)*7] * c;  \
5930     y6 += decode[13+(ofs)*7] * c;  \
5931     c = hc[2+(ofs)];               \
5932     x0 += decode[14+(ofs)*7] * c;  \
5933     x1 += decode[15+(ofs)*7] * c;  \
5934     x2 += decode[16+(ofs)*7] * c;  \
5935     x3 += decode[17+(ofs)*7] * c;  \
5936     x4 += decode[18+(ofs)*7] * c;  \
5937     x5 += decode[19+(ofs)*7] * c;  \
5938     x6 += decode[20+(ofs)*7] * c;  \
5939     c = hc[3+(ofs)];               \
5940     y0 += decode[21+(ofs)*7] * c;  \
5941     y1 += decode[22+(ofs)*7] * c;  \
5942     y2 += decode[23+(ofs)*7] * c;  \
5943     y3 += decode[24+(ofs)*7] * c;  \
5944     y4 += decode[25+(ofs)*7] * c;  \
5945     y5 += decode[26+(ofs)*7] * c;  \
5946     y6 += decode[27+(ofs)*7] * c;
5947 
// Tail step for one leftover coefficient: adds hc[ofs] times the 7 floats at
// decode[(ofs)*7 ..] into the x accumulators.
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5948 #define stbir__1_coeff_remnant( ofs ) \
5949     STBIR_SIMD_NO_UNROLL(decode);  \
5950     c = hc[0+(ofs)];               \
5951     x0 += decode[0+(ofs)*7] * c;   \
5952     x1 += decode[1+(ofs)*7] * c;   \
5953     x2 += decode[2+(ofs)*7] * c;   \
5954     x3 += decode[3+(ofs)*7] * c;   \
5955     x4 += decode[4+(ofs)*7] * c;   \
5956     x5 += decode[5+(ofs)*7] * c;   \
5957     x6 += decode[6+(ofs)*7] * c;   \
5958 
// Tail step for two leftover coefficients: hc[ofs] goes into the x accumulators
// and hc[ofs+1] into the y accumulators.
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5959 #define stbir__2_coeff_remnant( ofs ) \
5960     STBIR_SIMD_NO_UNROLL(decode);  \
5961     c = hc[0+(ofs)];               \
5962     x0 += decode[0+(ofs)*7] * c;   \
5963     x1 += decode[1+(ofs)*7] * c;   \
5964     x2 += decode[2+(ofs)*7] * c;   \
5965     x3 += decode[3+(ofs)*7] * c;   \
5966     x4 += decode[4+(ofs)*7] * c;   \
5967     x5 += decode[5+(ofs)*7] * c;   \
5968     x6 += decode[6+(ofs)*7] * c;   \
5969     c = hc[1+(ofs)];               \
5970     y0 += decode[7+(ofs)*7] * c;   \
5971     y1 += decode[8+(ofs)*7] * c;   \
5972     y2 += decode[9+(ofs)*7] * c;   \
5973     y3 += decode[10+(ofs)*7] * c;  \
5974     y4 += decode[11+(ofs)*7] * c;  \
5975     y5 += decode[12+(ofs)*7] * c;  \
5976     y6 += decode[13+(ofs)*7] * c;  \
5977 
// Tail step for three leftover coefficients: hc[ofs] and hc[ofs+2] go into the
// x accumulators, hc[ofs+1] into the y accumulators.
// The final line ends with a backslash, so the macro also absorbs the next
// source line.
5978 #define stbir__3_coeff_remnant( ofs ) \
5979     STBIR_SIMD_NO_UNROLL(decode);  \
5980     c = hc[0+(ofs)];               \
5981     x0 += decode[0+(ofs)*7] * c;   \
5982     x1 += decode[1+(ofs)*7] * c;   \
5983     x2 += decode[2+(ofs)*7] * c;   \
5984     x3 += decode[3+(ofs)*7] * c;   \
5985     x4 += decode[4+(ofs)*7] * c;   \
5986     x5 += decode[5+(ofs)*7] * c;   \
5987     x6 += decode[6+(ofs)*7] * c;   \
5988     c = hc[1+(ofs)];               \
5989     y0 += decode[7+(ofs)*7] * c;   \
5990     y1 += decode[8+(ofs)*7] * c;   \
5991     y2 += decode[9+(ofs)*7] * c;   \
5992     y3 += decode[10+(ofs)*7] * c;  \
5993     y4 += decode[11+(ofs)*7] * c;  \
5994     y5 += decode[12+(ofs)*7] * c;  \
5995     y6 += decode[13+(ofs)*7] * c;  \
5996     c = hc[2+(ofs)];               \
5997     x0 += decode[14+(ofs)*7] * c;  \
5998     x1 += decode[15+(ofs)*7] * c;  \
5999     x2 += decode[16+(ofs)*7] * c;  \
6000     x3 += decode[17+(ofs)*7] * c;  \
6001     x4 += decode[18+(ofs)*7] * c;  \
6002     x5 += decode[19+(ofs)*7] * c;  \
6003     x6 += decode[20+(ofs)*7] * c;  \
6004 
// Combines the two accumulator sets (x + y per channel) into output[0..6], then
// steps to the next output pixel: horizontal_coefficients advances by
// coefficient_width, horizontal_contributors by one entry, output by 7 floats.
6005 #define stbir__store_output()                     \
6006     output[0] = x0 + y0;                          \
6007     output[1] = x1 + y1;                          \
6008     output[2] = x2 + y2;                          \
6009     output[3] = x3 + y3;                          \
6010     output[4] = x4 + y4;                          \
6011     output[5] = x5 + y5;                          \
6012     output[6] = x6 + y6;                          \
6013     horizontal_coefficients += coefficient_width; \
6014     ++horizontal_contributors;                    \
6015     output += 7;
6016 
6017 #endif
6018 
// Re-include this header in "do horizontals" mode with the channel count set to 7.
// Presumably that pass expands the coefficient macros defined above into the
// 7-channel horizontal routines -- confirm against the STB_IMAGE_RESIZE_DO_HORIZONTALS section.
6019 #define STBIR__horizontal_channels 7
6020 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
6021 #include STBIR__HEADER_FILENAME
6022 
6023 
6024 // include all of the vertical resamplers (both scatter and gather versions)
6025 
// The header is re-included twice for every value of STBIR__vertical_channels
// from 1 to 8: once plain and once with STB_IMAGE_RESIZE_VERTICAL_CONTINUE.
// Presumably each plain pass emits the stbir__vertical_{gather,scatter}_with_N_coeffs
// routines and each CONTINUE pass the matching *_cont routines that the tables
// after these includes refer to -- confirm against the STB_IMAGE_RESIZE_DO_VERTICALS section.
// All three macros are defined again before every include; presumably the
// included section undefines them (TODO confirm).
6026 #define STBIR__vertical_channels 1
6027 #define STB_IMAGE_RESIZE_DO_VERTICALS
6028 #include STBIR__HEADER_FILENAME
6029 
6030 #define STBIR__vertical_channels 1
6031 #define STB_IMAGE_RESIZE_DO_VERTICALS
6032 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6033 #include STBIR__HEADER_FILENAME
6034 
6035 #define STBIR__vertical_channels 2
6036 #define STB_IMAGE_RESIZE_DO_VERTICALS
6037 #include STBIR__HEADER_FILENAME
6038 
6039 #define STBIR__vertical_channels 2
6040 #define STB_IMAGE_RESIZE_DO_VERTICALS
6041 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6042 #include STBIR__HEADER_FILENAME
6043 
6044 #define STBIR__vertical_channels 3
6045 #define STB_IMAGE_RESIZE_DO_VERTICALS
6046 #include STBIR__HEADER_FILENAME
6047 
6048 #define STBIR__vertical_channels 3
6049 #define STB_IMAGE_RESIZE_DO_VERTICALS
6050 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6051 #include STBIR__HEADER_FILENAME
6052 
6053 #define STBIR__vertical_channels 4
6054 #define STB_IMAGE_RESIZE_DO_VERTICALS
6055 #include STBIR__HEADER_FILENAME
6056 
6057 #define STBIR__vertical_channels 4
6058 #define STB_IMAGE_RESIZE_DO_VERTICALS
6059 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6060 #include STBIR__HEADER_FILENAME
6061 
6062 #define STBIR__vertical_channels 5
6063 #define STB_IMAGE_RESIZE_DO_VERTICALS
6064 #include STBIR__HEADER_FILENAME
6065 
6066 #define STBIR__vertical_channels 5
6067 #define STB_IMAGE_RESIZE_DO_VERTICALS
6068 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6069 #include STBIR__HEADER_FILENAME
6070 
6071 #define STBIR__vertical_channels 6
6072 #define STB_IMAGE_RESIZE_DO_VERTICALS
6073 #include STBIR__HEADER_FILENAME
6074 
6075 #define STBIR__vertical_channels 6
6076 #define STB_IMAGE_RESIZE_DO_VERTICALS
6077 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6078 #include STBIR__HEADER_FILENAME
6079 
6080 #define STBIR__vertical_channels 7
6081 #define STB_IMAGE_RESIZE_DO_VERTICALS
6082 #include STBIR__HEADER_FILENAME
6083 
6084 #define STBIR__vertical_channels 7
6085 #define STB_IMAGE_RESIZE_DO_VERTICALS
6086 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6087 #include STBIR__HEADER_FILENAME
6088 
6089 #define STBIR__vertical_channels 8
6090 #define STB_IMAGE_RESIZE_DO_VERTICALS
6091 #include STBIR__HEADER_FILENAME
6092 
6093 #define STBIR__vertical_channels 8
6094 #define STB_IMAGE_RESIZE_DO_VERTICALS
6095 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6096 #include STBIR__HEADER_FILENAME
6097 
// Signature shared by all vertical gather routines: blend the scanlines in
// `inputs` with `coeffs` into `output`. `input0_end` is the end pointer for
// inputs[0] (callers pass inputs[0] plus the scanline length in floats).
6098 typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
6099 
// Gather routines indexed by (number of scanlines blended - 1), 1 to 8 scanlines.
// stbir__resample_vertical_gather uses this table for the first batch of
// scanlines of an output row.
6100 static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
6101 {
6102   stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
6103 };
6104 
// Same indexing, but the "_cont" variants: used for every batch after the first
// one. Presumably these add onto what is already in `output` instead of
// overwriting it -- confirm in the vertical routine template.
6105 static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
6106 {
6107   stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
6108 };
6109 
// Signature shared by all vertical scatter routines: one `input` scanline
// (ending at `input_end`) is combined with `coeffs` into several `outputs`.
6110 typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
6111 
// Scatter routines for 1 to 8 coefficients, stored in ascending order (entry i
// handles i+1 coefficients). NOTE(review): going by the table name these
// presumably overwrite ("set") the output scanlines -- confirm at the call site.
6112 static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
6113 {
6114   stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
6115 };
6116 
// The "_cont" scatter variants in the same order. NOTE(review): going by the
// table name these presumably blend into existing output scanline contents
// -- confirm at the call site.
6117 static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
6118 {
6119   stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
6120 };
6121 
6122 
6123 static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row  STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6124 {
6125   int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
6126   int channels = stbir_info->channels;
6127   int width_times_channels = num_pixels * channels;
6128   void * output_buffer;
6129 
6130   // un-alpha weight if we need to
6131   if ( stbir_info->alpha_unweight )
6132   {
6133     STBIR_PROFILE_START( unalpha );
6134     stbir_info->alpha_unweight( encode_buffer, width_times_channels );
6135     STBIR_PROFILE_END( unalpha );
6136   }
6137 
6138   // write directly into output by default
6139   output_buffer = output_buffer_data;
6140 
6141   // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
6142   if ( stbir_info->out_pixels_cb )
6143     output_buffer = encode_buffer;
6144 
6145   STBIR_PROFILE_START( encode );
6146   // convert into the output buffer
6147   stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
6148   STBIR_PROFILE_END( encode );
6149 
6150   // if we have an output callback, call it to send the data
6151   if ( stbir_info->out_pixels_cb )
6152     stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
6153 }
6154 
6155 
6156 // Get the ring buffer pointer for an index
6157 static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
6158 {
6159   STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
6160 
6161   #ifdef STBIR__SEPARATE_ALLOCATIONS
6162     return split_info->ring_buffers[ index ];
6163   #else
6164     return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
6165   #endif
6166 }
6167 
6168 // Get the specified scan line from the ring buffer
6169 static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
6170 {
6171   int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6172   return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
6173 }
6174 
6175 static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6176 {
6177   float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
6178 
6179   STBIR_PROFILE_START( horizontal );
6180   if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
6181     STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
6182   else
6183     stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
6184   STBIR_PROFILE_END( horizontal );
6185 }
6186 
6187 static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
6188 {
6189   float* encode_buffer = split_info->vertical_buffer;
6190   float* decode_buffer = split_info->decode_buffer;
6191   int vertical_first = stbir_info->vertical_first;
6192   int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
6193   int width_times_channels = stbir_info->effective_channels * width;
6194 
6195   STBIR_ASSERT( stbir_info->vertical.is_gather );
6196 
6197   // loop over the contributing scanlines and scale into the buffer
6198   STBIR_PROFILE_START( vertical );
6199   {
6200     int k = 0, total = contrib_n1 - contrib_n0 + 1;
6201     STBIR_ASSERT( total > 0 );
6202     do {
6203       float const * inputs[8];
6204       int i, cnt = total; if ( cnt > 8 ) cnt = 8;
6205       for( i = 0 ; i < cnt ; i++ )
6206         inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
6207 
6208       // call the N scanlines at a time function (up to 8 scanlines of blending at once)
6209       ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
6210       k += cnt;
6211       total -= cnt;
6212     } while ( total );
6213   }
6214   STBIR_PROFILE_END( vertical );
6215 
6216   if ( vertical_first )
6217   {
6218     // Now resample the gathered vertical data in the horizontal axis into the encode buffer
6219     stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6220   }
6221 
6222   stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
6223                           encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6224 }
6225 
6226 static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
6227 {
6228   int ring_buffer_index;
6229   float* ring_buffer;
6230 
6231   // Decode the nth scanline from the source image into the decode buffer.
6232   stbir__decode_scanline( stbir_info, n, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6233 
6234   // update new end scanline
6235   split_info->ring_buffer_last_scanline = n;
6236 
6237   // get ring buffer
6238   ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6239   ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
6240 
6241   // Now resample it into the ring buffer.
6242   stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6243 
6244   // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
6245 }
6246 
6247 static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6248 {
6249   int y, start_output_y, end_output_y;
6250   stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6251   float const * vertical_coefficients = stbir_info->vertical.coefficients;
6252 
6253   STBIR_ASSERT( stbir_info->vertical.is_gather );
6254 
6255   start_output_y = split_info->start_output_y;
6256   end_output_y = split_info[split_count-1].end_output_y;
6257 
6258   vertical_contributors += start_output_y;
6259   vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
6260 
6261   // initialize the ring buffer for gathering
6262   split_info->ring_buffer_begin_index = 0;
6263   split_info->ring_buffer_first_scanline = vertical_contributors->n0;
6264   split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
6265 
6266   for (y = start_output_y; y < end_output_y; y++)
6267   {
6268     int in_first_scanline, in_last_scanline;
6269 
6270     in_first_scanline = vertical_contributors->n0;
6271     in_last_scanline = vertical_contributors->n1;
6272 
6273     // make sure the indexing hasn't broken
6274     STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
6275 
6276     // Load in new scanlines
6277     while (in_last_scanline > split_info->ring_buffer_last_scanline)
6278     {
6279       STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
6280 
6281       // make sure there was room in the ring buffer when we add new scanlines
6282       if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
6283       {
6284         split_info->ring_buffer_first_scanline++;
6285         split_info->ring_buffer_begin_index++;
6286       }
6287 
6288       if ( stbir_info->vertical_first )
6289       {
6290         float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
6291         // Decode the nth scanline from the source image into the decode buffer.
6292         stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6293       }
6294       else
6295       {
6296         stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
6297       }
6298     }
6299 
6300     // Now all buffers should be ready to write a row of vertical sampling, so do it.
6301     stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
6302 
6303     ++vertical_contributors;
6304     vertical_coefficients += stbir_info->vertical.coefficient_width;
6305   }
6306 }
6307 
// Sentinel written into element 0 of a float buffer to flag it as empty.
#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F

// Nonzero when element 0 of the float buffer `ptr` holds the empty sentinel.
#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ( (ptr)[0] == STBIR__FLOAT_EMPTY_MARKER )
6310 
6311 static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6312 {
6313   // evict a scanline out into the output buffer
6314   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6315 
6316   // dump the scanline out
6317   stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6318 
6319   // mark it as empty
6320   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6321 
6322   // advance the first scanline
6323   split_info->ring_buffer_first_scanline++;
6324   if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6325     split_info->ring_buffer_begin_index = 0;
6326 }
6327 
6328 static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6329 {
6330   // evict a scanline out into the output buffer
6331 
6332   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6333 
6334   // Now resample it into the buffer.
6335   stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6336 
6337   // dump the scanline out
6338   stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6339 
6340   // mark it as empty
6341   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6342 
6343   // advance the first scanline
6344   split_info->ring_buffer_first_scanline++;
6345   if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6346     split_info->ring_buffer_begin_index = 0;
6347 }
6348 
6349 static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
6350 {
6351   STBIR_ASSERT( !stbir_info->vertical.is_gather );
6352 
6353   STBIR_PROFILE_START( vertical );
6354   {
6355     int k = 0, total = n1 - n0 + 1;
6356     STBIR_ASSERT( total > 0 );
6357     do {
6358       float * outputs[8];
6359       int i, n = total; if ( n > 8 ) n = 8;
6360       for( i = 0 ; i < n ; i++ )
6361       {
6362         outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
6363         if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
6364         {
6365           n = i;
6366           break;
6367         }
6368       }
6369       // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
6370       ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
6371       k += n;
6372       total -= n;
6373     } while ( total );
6374   }
6375 
6376   STBIR_PROFILE_END( vertical );
6377 }
6378 
6379 typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
6380 
6381 static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6382 {
6383   int y, start_output_y, end_output_y, start_input_y, end_input_y;
6384   stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6385   float const * vertical_coefficients = stbir_info->vertical.coefficients;
6386   stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
6387   void * scanline_scatter_buffer;
6388   void * scanline_scatter_buffer_end;
6389   int on_first_input_y, last_input_y;
6390 
6391   STBIR_ASSERT( !stbir_info->vertical.is_gather );
6392 
6393   start_output_y = split_info->start_output_y;
6394   end_output_y = split_info[split_count-1].end_output_y;  // may do multiple split counts
6395 
6396   start_input_y = split_info->start_input_y;
6397   end_input_y = split_info[split_count-1].end_input_y;
6398 
6399   // adjust for starting offset start_input_y
6400   y = start_input_y + stbir_info->vertical.filter_pixel_margin;
6401   vertical_contributors += y ;
6402   vertical_coefficients += stbir_info->vertical.coefficient_width * y;
6403 
6404   if ( stbir_info->vertical_first )
6405   {
6406     handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
6407     scanline_scatter_buffer = split_info->decode_buffer;
6408     scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
6409   }
6410   else
6411   {
6412     handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
6413     scanline_scatter_buffer = split_info->vertical_buffer;
6414     scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
6415   }
6416 
6417   // initialize the ring buffer for scattering
6418   split_info->ring_buffer_first_scanline = start_output_y;
6419   split_info->ring_buffer_last_scanline = -1;
6420   split_info->ring_buffer_begin_index = -1;
6421 
6422   // mark all the buffers as empty to start
6423   for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
6424     stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
6425 
6426   // do the loop in input space
6427   on_first_input_y = 1; last_input_y = start_input_y;
6428   for (y = start_input_y ; y < end_input_y; y++)
6429   {
6430     int out_first_scanline, out_last_scanline;
6431 
6432     out_first_scanline = vertical_contributors->n0;
6433     out_last_scanline = vertical_contributors->n1;
6434 
6435     STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
6436 
6437     if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
6438     {
6439       float const * vc = vertical_coefficients;
6440 
6441       // keep track of the range actually seen for the next resize
6442       last_input_y = y;
6443       if ( ( on_first_input_y ) && ( y > start_input_y ) )
6444         split_info->start_input_y = y;
6445       on_first_input_y = 0;
6446 
6447       // clip the region
6448       if ( out_first_scanline < start_output_y )
6449       {
6450         vc += start_output_y - out_first_scanline;
6451         out_first_scanline = start_output_y;
6452       }
6453 
6454       if ( out_last_scanline >= end_output_y )
6455         out_last_scanline = end_output_y - 1;
6456 
6457       // if very first scanline, init the index
6458       if (split_info->ring_buffer_begin_index < 0)
6459         split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
6460 
6461       STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
6462 
6463       // Decode the nth scanline from the source image into the decode buffer.
6464       stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6465 
6466       // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
6467       if ( !stbir_info->vertical_first )
6468         stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6469 
6470       // Now it's sitting in the buffer ready to be distributed into the ring buffers.
6471 
6472       // evict from the ring buffer if it is full and this row reaches past its last scanline
6473       if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
6474            ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
6475         handle_scanline_for_scatter( stbir_info, split_info );
6476 
6477       // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.