v2 / thirdparty / stb_image / stb_image_resize2.h
10608 lines · 9153 sloc · 440.58 KB · 78bdeca9b196ea2a9ef311b4eae6b731be858702
Raw
<
1/* stb_image_resize2 - v2.11 - public domain image resizing
2
3 by Jeff Roberts (v2) and Jorge L Rodriguez
4 http://github.com/nothings/stb
5
6 Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
7 scaling and translation are supported, no rotations or shears.
8
9 COMPILING & LINKING
10 In one C/C++ file that #includes this file, do this:
11 #define STB_IMAGE_RESIZE_IMPLEMENTATION
12 before the #include. That will create the implementation in that file.
13
14 EASY API CALLS:
15 Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
16
17 stbir_resize_uint8_srgb( input_pixels, input_w, input_h, input_stride_in_bytes,
18 output_pixels, output_w, output_h, output_stride_in_bytes,
19 pixel_layout_enum )
20
21 stbir_resize_uint8_linear( input_pixels, input_w, input_h, input_stride_in_bytes,
22 output_pixels, output_w, output_h, output_stride_in_bytes,
23 pixel_layout_enum )
24
25 stbir_resize_float_linear( input_pixels, input_w, input_h, input_stride_in_bytes,
26 output_pixels, output_w, output_h, output_stride_in_bytes,
27 pixel_layout_enum )
28
29 If you pass NULL or zero for the output_pixels, we will allocate the output buffer
30 for you and return it from the function (free with free() or STBIR_FREE).
31 As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
32
33 API LEVELS
34 There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
35
36 See the "header file" section of the source for API documentation.
37
38 ADDITIONAL DOCUMENTATION
39
40 MEMORY ALLOCATION
41 By default, we use malloc and free for memory allocation. To override the
42 memory allocation, before the implementation #include, add a:
43
44 #define STBIR_MALLOC(size,user_data) ...
45 #define STBIR_FREE(ptr,user_data) ...
46
47 Each resize makes exactly one call to malloc/free (unless you use the
48 extended API where you can do one allocation for many resizes). Under
49 address sanitizer, we do separate allocations to find overread/writes.
50
51 PERFORMANCE
52 This library was written with an emphasis on performance. When testing
53 stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
54 STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
55 libs do by default). Also, make sure SIMD is turned on of course (default
56 for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
57
58 This library also comes with profiling built-in. If you define STBIR_PROFILE,
59 you can use the advanced API and get low-level profiling information by
60 calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
61 after a resize.
62
63 SIMD
64 Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
65
66 On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
67 ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
68 STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
69 or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
70 support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
71
72 On Linux, SSE2 and Neon are on by default for 64-bit x64 or ARM64. For 32-bit,
73 we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
74 on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
75 clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
76 automatically enable NEON.
77
78 On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
79 for converting back and forth to half-floats. This is autoselected when we
80 are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
81 the built-in half float hardware NEON instructions.
82
83 You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
84 Because x86 doesn't always have fma, we turn it off by default to maintain
85 determinism across all platforms. If you don't care about non-FMA determinism
86 and are willing to restrict yourself to more recent x86 CPUs (around the AVX
87 timeframe), then fma will give you around a 15% speedup.
88
89 You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
90 off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
91 to 40% faster, and AVX2 is generally another 12%.
92
93 ALPHA CHANNEL
94 Most of the resizing functions provide the ability to control how the alpha
95 channel of an image is processed.
96
97 When alpha represents transparency, it is important that when combining
98 colors with filtering, the pixels should not be treated equally; they
99 should use a weighted average based on their alpha values. For example,
100 if a pixel is 1% opaque bright green and another pixel is 99% opaque
101 black and you average them, the average will be 50% opaque, but the
102 unweighted average will be a middling green color, while the weighted
103 average will be nearly black. This means the unweighted version introduced
104 green energy that didn't exist in the source image.
105
106 (If you want to know why this makes sense, you can work out the math for
107 the following: consider what happens if you alpha composite a source image
108 over a fixed color and then average the output, vs. if you average the
109 source image pixels and then composite that over the same fixed color.
110 Only the weighted average produces the same result as the ground truth
111 composite-then-average result.)
112
113 Therefore, it is in general best to "alpha weight" the pixels when applying
114 filters to them. This essentially means multiplying the colors by the alpha
115 values before combining them, and then dividing by the alpha value at the
116 end.
117
118 The computer graphics industry introduced a technique called "premultiplied
119 alpha" or "associated alpha" in which image colors are stored in image files
120 already multiplied by their alpha. This saves some math when compositing,
121 and also avoids the need to divide by the alpha at the end (which is quite
122 inefficient). However, while premultiplied alpha is common in the movie CGI
123 industry, it is not commonplace in other industries like videogames, and most
124 consumer file formats are generally expected to contain not-premultiplied
125 colors. For example, Photoshop saves PNG files "unpremultiplied", and web
126 browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
127
128 Note that there are three possibilities that might describe your image
129 and resize expectation:
130
131 1. images are not premultiplied, alpha weighting is desired
132 2. images are not premultiplied, alpha weighting is not desired
133 3. images are premultiplied
134
135 Both case #2 and case #3 require the exact same math: no alpha weighting
136 should be applied or removed. Only case 1 requires extra math operations;
137 the other two cases can be handled identically.
138
139 stb_image_resize expects case #1 by default, applying alpha weighting to
140 images, expecting the input images to be unpremultiplied. This is what the
141 COLOR+ALPHA buffer types tell the resizer to do.
142
143 When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
144 STBIR_ABGR, STBIR_RA, or STBIR_AR you are telling us that the pixels are
145 non-premultiplied. In these cases, the resizer will alpha weight the colors
146 (effectively creating the premultiplied image), do the filtering, and then
147 convert back to non-premult on exit.
148
149 When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM, STBIR_ARGB_PM,
150 STBIR_ABGR_PM, STBIR_RA_PM or STBIR_AR_PM, you are telling us that the pixels
151 ARE premultiplied. In this case, the resizer doesn't have to do the
152 premultiplying - it can filter directly on the input. This is about twice as
153 fast as the non-premultiplied case, so it's the right option if your data is
154 already setup correctly.
155
156 When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
157 telling us that there is no channel that represents transparency; it may be
158 RGB and some unrelated fourth channel that has been stored in the alpha
159 channel, but it is actually not alpha. No special processing will be
160 performed.
161
162 The difference between the generic 4 or 2 channel layouts, and the
163 specialized _PM versions is with the _PM versions you are telling us that
164 the data *is* alpha, just don't premultiply it. That's important when
165 using SRGB pixel formats, we need to know where the alpha is, because
166 it is converted linearly (rather than with the SRGB converters).
167
168 Because alpha weighting produces the same effect as premultiplying, you
169 even have the option with non-premultiplied inputs to let the resizer
170 produce a premultiplied output. Because the initially computed alpha-weighted
171 output image is effectively premultiplied, this is actually more performant
172 than the normal path which un-premultiplies the output image as a final step.
173
174 Finally, when converting both in and out of non-premultiplied space (for
175 example, when using STBIR_RGBA), we go to somewhat heroic measures to
176 ensure that areas with zero alpha value pixels get something reasonable
177 in the RGB values. If you don't care about the RGB values of zero alpha
178 pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
179 function - this runs a non-premultiplied resize about 25% faster. That said,
180 when you really care about speed, using premultiplied pixels for both in
181 and out (STBIR_RGBA_PM, etc) is much faster than both of these non-premultiplied
182 options.
183
184 PIXEL LAYOUT CONVERSION
185 The resizer can convert from some pixel layouts to others. When using the
186 stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
187 on input, and STBIR_ARGB on output, and it will re-organize the channels
188 during the resize. Currently, you can only convert between two pixel
189 layouts with the same number of channels.
190
191 DETERMINISM
192 We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
193 This requires compiling with fast-math off (using at least /fp:precise).
194 Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
195 We attempt to do this with pragmas, but with Clang, you usually want to add
196 -ffp-contract=off to the command line as well.
197
198 For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
199 if the scalar x87 unit gets used at all, we immediately lose determinism.
200 On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
201 no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
202 with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
203 -fpmath=sse.
204
205 Note that we will not be deterministic with float data containing NaNs -
206 the NaNs will propagate differently on different SIMD and platforms.
207
208 If you turn on STBIR_USE_FMA, then we will be deterministic with other
209 fma targets, but we will differ from non-fma targets (this is unavoidable,
210 because a fma isn't simply an add with a mult - it also introduces a
211 rounding difference compared to non-fma instruction sequences).
212
213 FLOAT PIXEL FORMAT RANGE
214 Any range of values can be used for the non-alpha float data that you pass
215 in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
216 but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
217 scale back properly. The alpha channel must also be 0 to 1 for any format
218 that does premultiplication prior to resizing.
219
220 Note also that with float output, using filters with negative lobes, the
221 output filtered values might go slightly out of range. You can define
222 STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
223 to clamp to on output, if that's important.
224
225 MAX/MIN SCALE FACTORS
226 The input pixel resolutions are in integers, and we do the internal pointer
227 resolution in size_t sized integers. However, the scale ratio from input
228 resolution to output resolution is calculated in float form. This means
229 the effective possible scale ratio is limited to 24 bits (or 16 million
230 to 1). As you get close to the size of the float resolution (again, 16
231 million pixels wide or high), you might start seeing float inaccuracy
232 issues in general in the pipeline. If you have to do extreme resizes,
233 you can usually do this in multiple stages (using float intermediate
234 buffers).
235
236 FLIPPED IMAGES
237 Stride is just the delta from one scanline to the next. This means you can
238 use a negative stride to handle inverted images (point to the final
239 scanline and use a negative stride). You can invert the input or output,
240 using negative strides.
241
242 DEFAULT FILTERS
243 For functions which don't provide explicit control over what filters to
244 use, you can change the compile-time defaults with:
245
246 #define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_something
247 #define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_something
248
249 See stbir_filter in the header-file section for the list of filters.
250
251 NEW FILTERS
252 A number of 1D filter kernels are supplied. For a list of supported
253 filters, see the stbir_filter enum. You can install your own filters by
254 using the stbir_set_filter_callbacks function.
255
256 PROGRESS
257 For interactive use with slow resize operations, you can use the
258 scanline callbacks in the extended API. It would have to be a *very* large
259 image resample to need progress though - we're very fast.
260
261 CEIL and FLOOR
262 In scalar mode, the only functions we use from math.h are ceilf and floorf,
263 but if you have your own versions, you can define the STBIR_CEILF(v) and
264 STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
265 our own versions.
266
267 ASSERT
268 Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
269
270 PORTING FROM VERSION 1
271 The API has changed. You can continue to use the old version of stb_image_resize.h,
272 which is available in the "deprecated/" directory.
273
274 If you're using the old simple-to-use API, porting is straightforward.
275 (For more advanced APIs, read the documentation.)
276
277 stbir_resize_uint8():
278 - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
279
280 stbir_resize_float():
281 - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
282
283 stbir_resize_uint8_srgb():
284 - function name is unchanged
285 - cast channel count to `stbir_pixel_layout`
286 - above is sufficient unless your image has alpha and it's not RGBA/BGRA
287 - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
288
289 stbir_resize_uint8_srgb_edgemode()
290 - switch to the "medium complexity" API
291 - stbir_resize(), very similar API but a few more parameters:
292 - pixel_layout: cast channel count to `stbir_pixel_layout`
293 - data_type: STBIR_TYPE_UINT8_SRGB
294 - edge: unchanged (STBIR_EDGE_WRAP, etc.)
295 - filter: STBIR_FILTER_DEFAULT
296 - which channel is alpha is specified in stbir_pixel_layout, see enum for details
297
298 FUTURE TODOS
299 * For polyphase integral filters, we just memcpy the coeffs to dupe
300 them, but we should indirect and use the same coeff memory.
301 * Add pixel layout conversions for sensible different channel counts
302 (maybe, 1->3/4, 3->4, 4->1, 3->1).
303 * For SIMD encode and decode scanline routines, do any pre-aligning
304 for bad input/output buffer alignments and pitch?
305 * For very wide scanlines, we should do vertical strips to stay within
306 L2 cache. Maybe do chunks of 1K pixels at a time. There would be
307 some pixel reconversion, but probably dwarfed by things falling out
308 of cache. Probably also something possible with alternating between
309 scattering and gathering at high resize scales?
310 * Rewrite the coefficient generator to do many at once.
311 * AVX-512 vertical kernels - worried about downclocking here.
312 * Convert the reincludes to macros when we know they aren't changing.
313 * Experiment with pivoting the horizontal and always using the
314 vertical filters (which are faster, but perhaps not enough to overcome
315 the pivot cost and the extra memory touches). Need to buffer the whole
316 image so have to balance memory use.
317 * Most of our code is internally function pointers, should we compile
318 all the SIMD stuff always and dynamically dispatch?
319
320 CONTRIBUTORS
321 Jeff Roberts: 2.0 implementation, optimizations, SIMD
322 Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
323 Fabian Giesen: half float and srgb converters
324 Sean Barrett: API design, optimizations
325 Jorge L Rodriguez: Original 1.0 implementation
326 Aras Pranckevicius: bugfixes
327 Nathan Reed: warning fixes for 1.0
328
329 REVISIONS
330 2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
331 with AVX-2, fix some weird scaling edge conditions with
332 point sample mode.
333 2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
334 fix MSVC 32-bit arm half float routines.
335 2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
336 hardware half floats).
337 2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
338 to Ryan Salsbury), fix for sub-rect resizes, use the
339 pragmas to control unrolling when they are available.
340 2.07 (2024-05-24) fix for slow final split during threaded conversions of very
341 wide scanlines when downsampling (caused by extra input
342 converting), fix for wide scanline resamples with many
343 splits (int overflow), fix GCC warning.
344 2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
345 undersampling a single row on rare resize ratios (about 1%).
346 2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
347 fix for output callback (thanks Julien Koenen).
348 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
349 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
350 2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
351 2x-5x faster without simd, 4x-12x faster with simd,
352 in some cases, 20x to 40x faster esp resizing large to very small.
353 0.96 (2019-03-04) fixed warnings
354 0.95 (2017-07-23) fixed warnings
355 0.94 (2017-03-18) fixed warnings
356 0.93 (2017-03-03) fixed bug with certain combinations of heights
357 0.92 (2017-01-02) fix integer overflow on large (>2GB) images
358 0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
359 0.90 (2014-09-17) first released version
360
361 LICENSE
362 See end of file for license information.
363*/
364
365// __v_ start
366#ifdef __TINYC__
367#define STBIR_NO_SIMD
368#endif
369// __v_ end
370
371#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS) // for internal re-includes
372
373#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
374#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
375
376#include <stddef.h>
// Fixed-width unsigned integer aliases (8, 16, 32 and 64 bit).
// The _MSC_VER branch spells them with built-in types and does not include <stdint.h>;
// every other compiler gets them from <stdint.h>.
377#ifdef _MSC_VER
378typedef unsigned char stbir_uint8;
379typedef unsigned short stbir_uint16;
380typedef unsigned int stbir_uint32;
381typedef unsigned __int64 stbir_uint64;
382#else
383#include <stdint.h>
384typedef uint8_t stbir_uint8;
385typedef uint16_t stbir_uint16;
386typedef uint32_t stbir_uint32;
387typedef uint64_t stbir_uint64;
388#endif
389
// Define STBIR_SSE when _M_IX86_FP is defined and is at least 1.
// STBIR_SSE is one of the conditions that turns on STBIR_SSE2 in the x86 selection block below.
390#ifdef _M_IX86_FP
391#if ( _M_IX86_FP >= 1 )
392#ifndef STBIR_SSE
393#define STBIR_SSE
394#endif
395#endif
396#endif
397
// x86 SIMD selection. Entered for any x64 target macro, for __SSE2__, or when the user
// (or the check above) defined STBIR_SSE / STBIR_SSE2. Inside, STBIR_SSE2 is always defined,
// and STBIR_AVX, STBIR_AVX2 and STBIR_FP16C are layered on top when requested or detected.
398#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
399 #ifndef STBIR_SSE2
400 #define STBIR_SSE2
401 #endif
402 #if defined(__AVX__) || defined(STBIR_AVX2) // asking for AVX2 also turns on AVX, unless STBIR_NO_AVX is defined
403 #ifndef STBIR_AVX
404 #ifndef STBIR_NO_AVX
405 #define STBIR_AVX
406 #endif
407 #endif
408 #endif
409 #if defined(__AVX2__) || defined(STBIR_AVX2)
410 #ifndef STBIR_NO_AVX2 // STBIR_NO_AVX2 vetoes both the AVX2 define and the FP16C autoselect below
411 #ifndef STBIR_AVX2
412 #define STBIR_AVX2
413 #endif
414 #if defined( _MSC_VER ) && !defined(__clang__)
415 #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c
416 #define STBIR_FP16C
417 #endif
418 #endif
419 #endif
420 #endif
421 #ifdef __F16C__
422 #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
423 #define STBIR_FP16C
424 #endif
425 #endif
426#endif
427
// Auto-enable STBIR_NEON for ARM64 target macros, or when the compiler reports NEON through
// __ARM_NEON__ or through bit value 4 of __ARM_NEON_FP (an undefined macro evaluates as 0 in #if).
428#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
429#ifndef STBIR_NEON
430#define STBIR_NEON
431#endif
432#endif
433
// A user-supplied STBIR_USE_FMA is silently removed on 32-bit ARM targets.
434#if defined(_M_ARM) || defined(__arm__)
435#ifdef STBIR_USE_FMA
436#undef STBIR_USE_FMA // no FMA for 32-bit arm (the enclosing check matches both _M_ARM and __arm__)
437#endif
438#endif
439
// Enable the WASM SIMD path only when both __wasm__ and __wasm_simd128__ are defined.
440#if defined(__wasm__) && defined(__wasm_simd128__)
441#ifndef STBIR_WASM
442#define STBIR_WASM
443#endif
444#endif
445
// Linkage prefix used on the public function declarations in this header.
// A STBIRDEF supplied by the user is kept as is; otherwise STB_IMAGE_RESIZE_STATIC
// selects static, and the default is extern (extern "C" when compiled as C++).
446#ifndef STBIRDEF
447#ifdef STB_IMAGE_RESIZE_STATIC
448#define STBIRDEF static
449#else
450#ifdef __cplusplus
451#define STBIRDEF extern "C"
452#else
453#define STBIRDEF extern
454#endif
455#endif
456#endif
457
458//////////////////////////////////////////////////////////////////////////////
459//// start "header file" ///////////////////////////////////////////////////
460//
461// Easy-to-use API:
462//
463// * stride is the offset between successive rows of image data
464// in memory, in bytes. specify 0 for packed continuously in memory
465// * colorspace is linear or sRGB as specified by function name
466// * Uses the default filters
467// * Uses edge mode clamped
468// * returned result is a pointer to the output pixels, or 0 (NULL) in case of an error.
469
470
471// stbir_pixel_layout specifies:
472// number of channels
473// order of channels
474// whether color is premultiplied by alpha
475// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
476typedef enum
477{
478 STBIR_1CHANNEL = 1, // values 1 through 4 equal the channel count, so an old channel count can be cast to this enum
479 STBIR_2CHANNEL = 2, // generic 2 channels, no channel is treated as alpha
480 STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping)
481 STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping)
482 STBIR_4CHANNEL = 5, // generic 4 channels, no channel is treated as alpha (note that the value 4 is STBIR_RGBA, not this)
483
484 STBIR_RGBA = 4, // alpha formats, where alpha is NOT premultiplied into color channels
485 STBIR_BGRA = 6,
486 STBIR_ARGB = 7,
487 STBIR_ABGR = 8,
488 STBIR_RA = 9,
489 STBIR_AR = 10,
490
491 STBIR_RGBA_PM = 11, // alpha formats, where alpha is premultiplied into color channels
492 STBIR_BGRA_PM = 12,
493 STBIR_ARGB_PM = 13,
494 STBIR_ABGR_PM = 14,
495 STBIR_RA_PM = 15,
496 STBIR_AR_PM = 16,
497
498 STBIR_RGBA_NO_AW = 11, // alpha formats, where NO alpha weighting is applied at all!
499 STBIR_BGRA_NO_AW = 12, // these are just synonyms for the _PM flags (which also do
500 STBIR_ARGB_NO_AW = 13, // no alpha weighting). These names just make it more clear
501 STBIR_ABGR_NO_AW = 14, // for some folks).
502 STBIR_RA_NO_AW = 15, // same value as STBIR_RA_PM
503 STBIR_AR_NO_AW = 16, // same value as STBIR_AR_PM
504
505} stbir_pixel_layout;
506
507//===============================================================
508// Simple-complexity API
509//
510// If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
511//--------------------------------
512
// 8-bit per channel resize, sRGB colorspace (colorspace is selected by the function name).
// A stride of 0 means rows are packed continuously in memory.
513STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
514 unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
515 stbir_pixel_layout pixel_type );
516
// 8-bit per channel resize, linear colorspace. Same parameters as stbir_resize_uint8_srgb.
517STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
518 unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
519 stbir_pixel_layout pixel_type );
520
// float per channel resize, linear colorspace. Strides are still given in bytes.
521STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
522 float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
523 stbir_pixel_layout pixel_type );
524//===============================================================
525
526//===============================================================
527// Medium-complexity API
528//
529// This extends the easy-to-use API as follows:
530//
531// * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
532// * Edge wrap can be selected explicitly
533// * Filter can be selected explicitly
534//--------------------------------
535
// Edge modes selectable through the medium and extended APIs; the easy-to-use API always uses clamp.
536typedef enum
537{
538 STBIR_EDGE_CLAMP = 0, // the default for the extended API as well (see stbir_set_edgemodes)
539 STBIR_EDGE_REFLECT = 1,
540 STBIR_EDGE_WRAP = 2, // this edge mode is slower and uses more memory
541 STBIR_EDGE_ZERO = 3,
542} stbir_edge;
543
// 1D filter kernels that can be selected for a resize.
544typedef enum
545{
546 STBIR_FILTER_DEFAULT = 0, // use same filter type that easy-to-use API chooses
547 STBIR_FILTER_BOX = 1, // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
548 STBIR_FILTER_TRIANGLE = 2, // On upsampling, produces same results as bilinear texture filtering
549 STBIR_FILTER_CUBICBSPLINE = 3, // The cubic b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque
550 STBIR_FILTER_CATMULLROM = 4, // An interpolating cubic spline
551 STBIR_FILTER_MITCHELL = 5, // Mitchell-Netravali filter with B=1/3, C=1/3
552 STBIR_FILTER_POINT_SAMPLE = 6, // Simple point sampling
553 STBIR_FILTER_OTHER = 7, // User callback specified (see stbir_set_filter_callbacks)
554} stbir_filter;
555
// Per-channel storage format of the pixel data.
556typedef enum
557{
558 STBIR_TYPE_UINT8 = 0,
559 STBIR_TYPE_UINT8_SRGB = 1, // sRGB color channels; alpha, when the layout has one, is converted linearly
560 STBIR_TYPE_UINT8_SRGB_ALPHA = 2, // alpha channel, when present, should also be SRGB (this is very unusual)
561 STBIR_TYPE_UINT16 = 3,
562 STBIR_TYPE_FLOAT = 4,
563 STBIR_TYPE_HALF_FLOAT = 5
564} stbir_datatype;
565
566// medium api
// Resize with a single pixel layout, datatype, edge mode and filter for the whole operation
// (the extended API is needed for separate per-axis or input/output settings).
// Pass STBIR_FILTER_DEFAULT to get the same filter the easy-to-use API chooses.
567STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
568 void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
569 stbir_pixel_layout pixel_layout, stbir_datatype data_type,
570 stbir_edge edge, stbir_filter filter );
571//===============================================================
572
573
574
575//===============================================================
576// Extended-complexity API
577//
578// This API exposes all resize functionality.
579//
580// * Separate filter types for each axis
581// * Separate edge modes for each axis
582// * Separate input and output data types
583// * Can specify regions with subpixel correctness
584// * Can specify alpha flags
585// * Can specify a memory callback
586// * Can specify a callback data type for pixel input and output
587// * Can be threaded for a single resize
588// * Can be used to resize many frames without recalculating the sampler info
589//
590// Use this API as follows:
591// 1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
592// 2) Call any of the stbir_set functions
593// 3) Optionally call stbir_build_samplers() if you are going to resample multiple times
594// with the same input and output dimensions (like resizing video frames)
595// 4) Resample by calling stbir_resize_extended().
596// 5) Call stbir_free_samplers() if you called stbir_build_samplers()
597//--------------------------------
598
599
600// Types:
601
602// INPUT CALLBACK: this callback is used for input scanlines
603typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
604
605// OUTPUT CALLBACK: this callback is used for output scanlines
606typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
607
608// callbacks for user installed filters
609typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
610typedef float stbir__support_callback( float scale, void * user_data );
611
612// internal structure with precomputed scaling
613typedef struct stbir__info stbir__info;
614
615typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_set_* functions to set these values for future compatibility
616{
617 void * user_data; // see stbir_set_user_data
618 void const * input_pixels;
619 int input_w, input_h;
620 double input_s0, input_t0, input_s1, input_t1; // input sub-region, see stbir_set_input_subrect
621 stbir_input_callback * input_cb; // optional, see stbir_set_pixel_callbacks
622 void * output_pixels;
623 int output_w, output_h;
624 int output_subx, output_suby, output_subw, output_subh; // output sub-region, see stbir_set_output_pixel_subrect
625 stbir_output_callback * output_cb; // optional, see stbir_set_pixel_callbacks
626 int input_stride_in_bytes;
627 int output_stride_in_bytes;
628 int splits;
629 int fast_alpha; // NOTE(review): presumably set by stbir_set_non_pm_alpha_speed_over_quality - confirm in the implementation
630 int needs_rebuild; // NOTE(review): presumably flags that the samplers must be rebuilt - confirm in the implementation
631 int called_alloc;
632 stbir_pixel_layout input_pixel_layout_public;
633 stbir_pixel_layout output_pixel_layout_public;
634 stbir_datatype input_data_type;
635 stbir_datatype output_data_type;
636 stbir_filter horizontal_filter, vertical_filter;
637 stbir_edge horizontal_edge, vertical_edge;
638 stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
639 stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
640 stbir__info * samplers; // internal structure with precomputed scaling
641} STBIR_RESIZE;
642
643// extended complexity api
644
645
646// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
647STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
648 const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero
649 void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
650 stbir_pixel_layout pixel_layout, stbir_datatype data_type );
651
652//===============================================================
653// You can update these parameters any time after resize_init and there is no cost
654//--------------------------------
655
656STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
657STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ); // no callbacks by default
658STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ); // pass back STBIR_RESIZE* by default
659STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
660
661//===============================================================
662
663
664//===============================================================
665// If you call any of these functions, you will trigger a sampler rebuild!
666//--------------------------------
667
668STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ); // sets new buffer layouts
669STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ); // CLAMP by default
670
671STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
672STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
673
674STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets both sub-regions (full regions by default)
675STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 ); // sets input sub-region (full region by default)
676STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
677
678// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
679// that fills the zero alpha pixel's RGB values with something plausible. If you don't care about areas of
680// zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
681// types of resizes.
682STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
683//===============================================================
684
685
686//===============================================================
687// You can call build_samplers to prebuild all the internal data we need to resample.
688// Then, if you call resize_extended many times with the same resize, you only pay the
689// cost once.
690// If you do call build_samplers, you MUST call free_samplers eventually.
691//--------------------------------
692
693// This builds the samplers and does one allocation
694STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
695
696// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
697STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
698//===============================================================
699
700
701// And this is the main function to perform the resize synchronously on one thread.
702STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
703
704
705//===============================================================
706// Use these functions for multithreading.
707// 1) You call stbir_build_samplers_with_splits first on the main thread
//   2) Then call stbir_resize_extended_split on each thread
709// 3) stbir_free_samplers when done on the main thread
710//--------------------------------
711
712// This will build samplers for threading.
713// You can pass in the number of threads you'd like to use (try_splits).
// It returns the number of splits (threads) that you can actually use for the resize.
// It might be fewer than try_splits if the image resize can't be split up that many ways.
716
717STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
718
// This function does a split of the resizing (you call this function for each
// split, on multiple threads). A split is a piece of the output resize pixel space.
721
722// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
723
// Usually, you will always call stbir_resize_extended_split with split_start as the
// thread_index and "1" for the split_count.
// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
// only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
// split_count each time to turn it into a 4 thread resize. (This is unusual).
729
730STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
731//===============================================================
732
733
734//===============================================================
735// Pixel Callbacks info:
736//--------------------------------
737
738// The input callback is super flexible - it calls you with the input address
739// (based on the stride and base pointer), it gives you an optional_output
740// pointer that you can fill, or you can just return your own pointer into
741// your own data.
742//
743// You can also do conversion from non-supported data types if necessary - in
744// this case, you ignore the input_ptr and just use the x and y parameters to
745// calculate your own input_ptr based on the size of each non-supported pixel.
746// (Something like the third example below.)
747//
748// You can also install just an input or just an output callback by setting the
749// callback that you don't want to zero.
750//
751// First example, progress: (getting a callback that you can monitor the progress):
752// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
753// {
754// percentage_done = y / input_height;
755// return input_ptr; // use buffer from call
756// }
757//
758// Next example, copying: (copy from some other buffer or stream):
759// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
760// {
761// CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
762// return optional_output; // return the optional buffer that we filled
763// }
764//
765// Third example, input another buffer without copying: (zero-copy from other buffer):
766// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
767// {
768// void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
769// return pixels; // return pointer to your data without copying
770// }
771//
772//
773// The output callback is considerably simpler - it just calls you so that you can dump
774// out each scanline. You could even directly copy out to disk if you have a simple format
775// like TGA or BMP. You can also convert to other output types here if you want.
776//
777// Simple example:
//   void my_output( void const * output_ptr, int num_pixels, int y, void * context )
//   {
//      percentage_done = y / output_height;
//      fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
//   }
783//===============================================================
784
785
786
787
788//===============================================================
789// optional built-in profiling API
790//--------------------------------
791
792#ifdef STBIR_PROFILE
793
794typedef struct STBIR_PROFILE_INFO
795{
796 stbir_uint64 total_clocks;
797
798 // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
799 // there are "resize_count" number of zones
800 stbir_uint64 clocks[ 8 ];
801 char const ** descriptions;
802
803 // count of clocks and descriptions
804 stbir_uint32 count;
805} STBIR_PROFILE_INFO;
806
807// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
808STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
809
810// use after calling stbir_resize_extended
811STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
812
813// use after calling stbir_resize_extended_split
814STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
815
816//===============================================================
817
818#endif
819
820
821//// end header file /////////////////////////////////////////////////////
822#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
823
824#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
825
826#ifndef STBIR_ASSERT
827#include <assert.h>
828#define STBIR_ASSERT(x) assert(x)
829#endif
830
831#ifndef STBIR_MALLOC
832#include <stdlib.h>
833#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
834#define STBIR_FREE(ptr,user_data) ((void)(user_data), free(ptr))
835// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
836#endif
837
838#ifdef _MSC_VER
839
840#define stbir__inline __forceinline
841
842#else
843
844#define stbir__inline __inline__
845
846// Clang address sanitizer
847#if defined(__has_feature)
848 #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
849 #ifndef STBIR__SEPARATE_ALLOCATIONS
850 #define STBIR__SEPARATE_ALLOCATIONS
851 #endif
852 #endif
853#endif
854
855#endif
856
857// GCC and MSVC
858#if defined(__SANITIZE_ADDRESS__)
859 #ifndef STBIR__SEPARATE_ALLOCATIONS
860 #define STBIR__SEPARATE_ALLOCATIONS
861 #endif
862#endif
863
864// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
865// Otherwise, this is a determinism disaster.
866#ifndef STBIR_DONT_CHANGE_FP_CONTRACT // override in case you don't want this behavior
867#if defined(_MSC_VER) && !defined(__clang__)
868#if _MSC_VER > 1200
869#pragma fp_contract(off)
870#endif
871#elif defined(__GNUC__) && !defined(__clang__)
872#pragma GCC optimize("fp-contract=off")
873#else
874#pragma STDC FP_CONTRACT OFF
875#endif
876#endif
877
878#ifdef _MSC_VER
879#define STBIR__UNUSED(v) (void)(v)
880#else
881#define STBIR__UNUSED(v) (void)sizeof(v)
882#endif
883
884#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
885
886
887#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
888#define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_CATMULLROM
889#endif
890
891#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
892#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_MITCHELL
893#endif
894
895
896#ifndef STBIR__HEADER_FILENAME
897#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
898#endif
899
900// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
901// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
// Internal pixel layout ids. The numeric order is deliberate (it allows range
// comparisons between layout groups), so the first enumerator of every group is
// pinned explicitly and the remaining ones count up from it.
typedef enum
{
  // values 0..4
  STBIRI_1CHANNEL = 0,
  STBIRI_2CHANNEL,       // 1
  STBIRI_RGB,            // 2
  STBIRI_BGR,            // 3
  STBIRI_4CHANNEL,       // 4

  // values 5..10
  STBIRI_RGBA = 5,
  STBIRI_BGRA,           // 6
  STBIRI_ARGB,           // 7
  STBIRI_ABGR,           // 8
  STBIRI_RA,             // 9
  STBIRI_AR,             // 10

  // values 11..16: the _PM variants (presumably premultiplied alpha, going by the suffix)
  STBIRI_RGBA_PM = 11,
  STBIRI_BGRA_PM,        // 12
  STBIRI_ARGB_PM,        // 13
  STBIRI_ABGR_PM,        // 14
  STBIRI_RA_PM,          // 15
  STBIRI_AR_PM,          // 16
} stbir_internal_pixel_layout;
924
925// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
926#define STBIR_BGR bad_dont_use_in_implementation
927#define STBIR_1CHANNEL STBIR_BGR
928#define STBIR_2CHANNEL STBIR_BGR
929#define STBIR_RGB STBIR_BGR
930#define STBIR_RGBA STBIR_BGR
931#define STBIR_4CHANNEL STBIR_BGR
932#define STBIR_BGRA STBIR_BGR
933#define STBIR_ARGB STBIR_BGR
934#define STBIR_ABGR STBIR_BGR
935#define STBIR_RA STBIR_BGR
936#define STBIR_AR STBIR_BGR
937#define STBIR_RGBA_PM STBIR_BGR
938#define STBIR_BGRA_PM STBIR_BGR
939#define STBIR_ARGB_PM STBIR_BGR
940#define STBIR_ABGR_PM STBIR_BGR
941#define STBIR_RA_PM STBIR_BGR
942#define STBIR_AR_PM STBIR_BGR
943
944// must match stbir_datatype
// Byte size of a single channel value, one entry per stbir_datatype in enum order
// (keep this table in sync with stbir_datatype).
static unsigned char stbir__type_size[] = {
  1,  // STBIR_TYPE_UINT8
  1,  // STBIR_TYPE_UINT8_SRGB
  1,  // STBIR_TYPE_UINT8_SRGB_ALPHA
  2,  // STBIR_TYPE_UINT16
  4,  // STBIR_TYPE_FLOAT
  2   // STBIR_TYPE_HALF_FLOAT
};
948
949// When gathering, the contributors are which source pixels contribute.
950// When scattering, the contributors are which destination pixels are contributed to.
// A range of contributing pixels, stored as its first and last index.
typedef struct
{
  int n0, n1; // n0 = first contributing pixel, n1 = last contributing pixel
} stbir__contributors;
956
// Summary of the sample indices touched by a whole filter.
typedef struct
{
  // first and last sample index for the whole filter
  int lowest, highest;

  // widest single set of samples needed for any one output
  int widest;
} stbir__filter_extent_info;
963
// One run of pixels to be written into the decode buffer.
typedef struct
{
  // first and last decode-buffer pixel that will be written
  int n0, n1;

  // pixel offset into input_scanline where the run is read from
  int pixel_offset_for_input;
} stbir__span;
970
971typedef struct stbir__scale_info
972{
973 int input_full_size;
974 int output_sub_size;
975 float scale;
976 float inv_scale;
977 float pixel_shift; // starting shift in output pixel space (in pixels)
978 int scale_is_rational;
979 stbir_uint32 scale_numerator, scale_denominator;
980} stbir__scale_info;
981
982typedef struct
983{
984 stbir__contributors * contributors;
985 float* coefficients;
986 stbir__contributors * gather_prescatter_contributors;
987 float * gather_prescatter_coefficients;
988 stbir__scale_info scale_info;
989 float support;
990 stbir_filter filter_enum;
991 stbir__kernel_callback * filter_kernel;
992 stbir__support_callback * filter_support;
993 stbir_edge edge;
994 int coefficient_width;
995 int filter_pixel_width;
996 int filter_pixel_margin;
997 int num_contributors;
998 int contributors_size;
999 int coefficients_size;
1000 stbir__filter_extent_info extent_info;
1001 int is_gather; // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
1002 int gather_prescatter_num_contributors;
1003 int gather_prescatter_coefficient_width;
1004 int gather_prescatter_contributors_size;
1005 int gather_prescatter_coefficients_size;
1006} stbir__sampler;
1007
1008typedef struct
1009{
1010 stbir__contributors conservative;
1011 int edge_sizes[2]; // this can be less than filter_pixel_margin, if the filter and scaling falls off
1012 stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
1013} stbir__extents;
1014
// Working state owned by a single split; stbir__info points at an array of these.
typedef struct
{
#if defined(STBIR_PROFILE)
  // per-split profile zones, reachable either by name or by index
  union
  {
    struct
    {
      stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha;
    } named;
    stbir_uint64 array[8];
  } profile;
  stbir_uint64 * current_zone_excluded_ptr;
#endif

  float * decode_buffer;

  int ring_buffer_first_scanline;
  int ring_buffer_last_scanline;

  // index inside the ring buffer where first_scanline lives
  int ring_buffer_begin_index;

  int start_output_y;
  int end_output_y;

  // used in scatter only
  int start_input_y;
  int end_input_y;

#if defined(STBIR__SEPARATE_ALLOCATIONS)
  // one pointer for each ring buffer
  float ** ring_buffers;
#else
  // one big buffer that we index into
  float * ring_buffer;
#endif

  float * vertical_buffer;

  // presumably padding so neighbouring splits do not share a cache line - TODO confirm
  char no_cache_straddle[64];
} stbir__per_split_info;
1043
1044typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
1045typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
1046typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
1047 stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
1048typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
1049typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
1050
1051struct stbir__info
1052{
1053#ifdef STBIR_PROFILE
1054 union
1055 {
1056 struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
1057 stbir_uint64 array[7];
1058 } profile;
1059 stbir_uint64 * current_zone_excluded_ptr;
1060#endif
1061 stbir__sampler horizontal;
1062 stbir__sampler vertical;
1063
1064 void const * input_data;
1065 void * output_data;
1066
1067 int input_stride_bytes;
1068 int output_stride_bytes;
1069 int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
1070 int ring_buffer_num_entries; // Total number of entries in the ring buffer.
1071
1072 stbir_datatype input_type;
1073 stbir_datatype output_type;
1074
1075 stbir_input_callback * in_pixels_cb;
1076 void * user_data;
1077 stbir_output_callback * out_pixels_cb;
1078
1079 stbir__extents scanline_extents;
1080
1081 void * alloced_mem;
1082 stbir__per_split_info * split_info; // by default 1, but there will be N of these allocated based on the thread init you did
1083
1084 stbir__decode_pixels_func * decode_pixels;
1085 stbir__alpha_weight_func * alpha_weight;
1086 stbir__horizontal_gather_channels_func * horizontal_gather_channels;
1087 stbir__alpha_unweight_func * alpha_unweight;
1088 stbir__encode_pixels_func * encode_pixels;
1089
1090 int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated
1091 int splits; // count of splits
1092
1093 stbir_internal_pixel_layout input_pixel_layout_internal;
1094 stbir_internal_pixel_layout output_pixel_layout_internal;
1095
1096 int input_color_and_type;
1097 int offset_x, offset_y; // offset within output_data
1098 int vertical_first;
1099 int channels;
1100 int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
1101 size_t alloced_total;
1102};
1103
1104
1105#define stbir__max_uint8_as_float 255.0f
1106#define stbir__max_uint16_as_float 65535.0f
1107#define stbir__max_uint8_as_float_inverted (1.0f/255.0f)
1108#define stbir__max_uint16_as_float_inverted (1.0f/65535.0f)
1109#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
1110
1111// min/max friendly
1112#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
1113 if ( (x) < (xmin) ) (x) = (xmin); \
1114 if ( (x) > (xmax) ) (x) = (xmax); \
1115 break; \
1116}
1117
1118static stbir__inline int stbir__min(int a, int b)
1119{
1120 return a < b ? a : b;
1121}
1122
1123static stbir__inline int stbir__max(int a, int b)
1124{
1125 return a > b ? a : b;
1126}
1127
1128static float stbir__srgb_uchar_to_linear_float[256] = {
1129 0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
1130 0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
1131 0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
1132 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
1133 0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
1134 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
1135 0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
1136 0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
1137 0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
1138 0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
1139 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
1140 0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
1141 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
1142 0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
1143 0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
1144 0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
1145 0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
1146 0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
1147 0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
1148 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
1149 0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
1150 0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
1151 0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
1152 0.982251f, 0.991102f, 1.0f
1153};
1154
// Lets the same 32 bits be viewed either as an unsigned integer or as a float.
typedef union
{
  unsigned int u; // raw bit pattern
  float f;        // the same bits read as a float
} stbir__FP32;
1160
1161// From https://gist.github.com/rygorous/2203834
1162
1163static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
1164 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
1165 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
1166 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
1167 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
1168 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
1169 0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
1170 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
1171 0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
1172 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
1173 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
1174 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
1175 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
1176 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
1177};
1178
1179static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
1180{
1181 static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
1182 static const stbir__FP32 minval = { (127-13) << 23 };
1183 stbir_uint32 tab,bias,scale,t;
1184 stbir__FP32 f;
1185
1186 // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
1187 // The tests are carefully written so that NaNs map to 0, same as in the reference
1188 // implementation.
1189 if (!(in > minval.f)) // written this way to catch NaNs
1190 return 0;
1191 if (in > almostone.f)
1192 return 255;
1193
1194 // Do the table lookup and unpack bias, scale
1195 f.f = in;
1196 tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
1197 bias = (tab >> 16) << 9;
1198 scale = tab & 0xffff;
1199
1200 // Grab next-highest mantissa bits and perform linear interpolation
1201 t = (f.u >> 12) & 0xff;
1202 return (unsigned char) ((bias + scale*t) >> 16);
1203}
1204
1205#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
1206#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
1207#endif
1208
1209#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
1210#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
1211#endif
1212
1213// restrict pointers for the output pointers, other loop and unroll control
1214#if defined( _MSC_VER ) && !defined(__clang__)
1215 #define STBIR_STREAMOUT_PTR( star ) star __restrict
1216 #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
1217 #if _MSC_VER >= 1900
1218 #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
1219 #else
1220 #define STBIR_NO_UNROLL_LOOP_START
1221 #endif
1222#elif defined( __clang__ )
1223 #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1224 #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
1225 #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
1226 #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
1227 #else
1228 #define STBIR_NO_UNROLL_LOOP_START
1229 #endif
1230#elif defined( __GNUC__ )
1231 #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1232 #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
1233 #if __GNUC__ >= 14
1234 #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
1235 #else
1236 #define STBIR_NO_UNROLL_LOOP_START
1237 #endif
1238 #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
1239#else
1240 #define STBIR_STREAMOUT_PTR( star ) star
1241 #define STBIR_NO_UNROLL( ptr )
1242 #define STBIR_NO_UNROLL_LOOP_START
1243#endif
1244
1245#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
1246#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
1247#endif
1248
1249#ifdef STBIR_NO_SIMD // force simd off for whatever reason
1250
1251// force simd off overrides everything else, so clear it all
1252
1253#ifdef STBIR_SSE2
1254#undef STBIR_SSE2
1255#endif
1256
1257#ifdef STBIR_AVX
1258#undef STBIR_AVX
1259#endif
1260
1261#ifdef STBIR_NEON
1262#undef STBIR_NEON
1263#endif
1264
1265#ifdef STBIR_AVX2
1266#undef STBIR_AVX2
1267#endif
1268
1269#ifdef STBIR_FP16C
1270#undef STBIR_FP16C
1271#endif
1272
1273#ifdef STBIR_WASM
1274#undef STBIR_WASM
1275#endif
1276
1277#ifdef STBIR_SIMD
1278#undef STBIR_SIMD
1279#endif
1280
1281#else // STBIR_SIMD
1282
1283// __v_ start
1284# if defined(STBIR_SSE2) && !defined(__TINYC__)
1285// __v_ end
1286 #include <emmintrin.h>
1287
1288 #define stbir__simdf __m128
1289 #define stbir__simdi __m128i
1290
1291 #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
1292 #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
1293
1294 #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
1295 #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
1296 #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1297 #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
1298 #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values must be zero
1299 #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
1300 #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
1301 #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
1302 #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
1303 #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
1304
1305 #define stbir__simdf_zeroP() _mm_setzero_ps()
1306 #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
1307
1308 #define stbir__simdf_store( ptr, reg ) _mm_storeu_ps( (float*)(ptr), reg )
1309 #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
1310 #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
1311 #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
1312
1313 #define stbir__simdi_store( ptr, reg ) _mm_storeu_si128( (__m128i*)(ptr), reg )
1314 #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
1315 #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
1316
1317 #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
1318
1319 #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1320 { \
1321 stbir__simdi zero = _mm_setzero_si128(); \
1322 out2 = _mm_unpacklo_epi8( ireg, zero ); \
1323 out3 = _mm_unpackhi_epi8( ireg, zero ); \
1324 out0 = _mm_unpacklo_epi16( out2, zero ); \
1325 out1 = _mm_unpackhi_epi16( out2, zero ); \
1326 out2 = _mm_unpacklo_epi16( out3, zero ); \
1327 out3 = _mm_unpackhi_epi16( out3, zero ); \
1328 }
1329
1330#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
1331 { \
1332 stbir__simdi zero = _mm_setzero_si128(); \
1333 out = _mm_unpacklo_epi8( ireg, zero ); \
1334 out = _mm_unpacklo_epi16( out, zero ); \
1335 }
1336
1337 #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
1338 { \
1339 stbir__simdi zero = _mm_setzero_si128(); \
1340 out0 = _mm_unpacklo_epi16( ireg, zero ); \
1341 out1 = _mm_unpackhi_epi16( ireg, zero ); \
1342 }
1343
1344 #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
1345 #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
1346 #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
1347 #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
1348
1349 #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
1350 #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
1351 #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
1352 #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
1353 #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1354 #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1355 #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1356 #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1357
1358 #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
1359 #include <immintrin.h>
1360 #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
1361 #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
1362 #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
1363 #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
1364 #else
1365 #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
1366 #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
1367 #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
1368 #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
1369 #endif
1370
1371 #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
1372 #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
1373
1374 #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
1375 #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
1376
1377 #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
1378 #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
1379 #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
1380 #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
1381
1382 #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
1383 #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
1384
1385 static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f }; // OR-mask that puts 1.0f into lanes 1 and 3 (used by stbir__simdf_a1a1)
1386 static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f }; // OR-mask that puts 1.0f into lanes 0 and 2 (used by stbir__simdf_1a1a)
1387 #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) ) // out = { alp[3], alp[3], alp[3], ones[2] }
1388 #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) ) // out = { ones[0], alp[0], alp[0], alp[0] }
1389 #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones ) // out = { alp[1], 1, alp[3], 1 }: the 64-bit shift moves odd lanes down and zeroes their old slot; 'ones' is not used
1390 #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros ) // out = { 1, alp[0], 1, alp[2] }: the 64-bit shift moves even lanes up and zeroes their old slot; 'ones' is not used
1391
1392 #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
1393
1394 #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
1395 #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
1396 #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
1397
1398 #define stbir__simdf_pack_to_8bytes(out,aa,bb) /* clamp aa and bb to [0, STBIR_max_uint8_as_float], truncate to int, and pack to bytes: out = aa0..3, bb0..3, then the same 8 bytes again */ \
1399 { /* NOTE: the locals below are named af, bf, a, b - an argument spelled the same way would be shadowed by them */ \
1400 stbir__simdf af,bf; \
1401 stbir__simdi a,b; \
1402 af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); /* upper clamp */ \
1403 bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
1404 af = _mm_max_ps( af, _mm_setzero_ps() ); /* lower clamp at zero */ \
1405 bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
1406 a = _mm_cvttps_epi32( af ); /* float -> int32, rounding toward zero */ \
1407 b = _mm_cvttps_epi32( bf ); \
1408 a = _mm_packs_epi32( a, b ); /* 8 x int16: aa0..3 followed by bb0..3 */ \
1409 out = _mm_packus_epi16( a, a ); /* 16 x uint8: the 8 packed bytes appear in both the low and the high half */ \
1410 }
1411
1412 #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) /* load 16 consecutive floats from ptr as a 4x4 matrix and transpose it: oK = { ptr[K], ptr[K+4], ptr[K+8], ptr[K+12] } */ \
1413 stbir__simdf_load( o0, (ptr) ); /* NOTE: these four loads sit outside the braces, so the macro expands to several statements */ \
1414 stbir__simdf_load( o1, (ptr)+4 ); \
1415 stbir__simdf_load( o2, (ptr)+8 ); \
1416 stbir__simdf_load( o3, (ptr)+12 ); \
1417 { \
1418 __m128 tmp0, tmp1, tmp2, tmp3; \
1419 tmp0 = _mm_unpacklo_ps(o0, o1); /* { o0[0], o1[0], o0[1], o1[1] } */ \
1420 tmp2 = _mm_unpacklo_ps(o2, o3); /* { o2[0], o3[0], o2[1], o3[1] } */ \
1421 tmp1 = _mm_unpackhi_ps(o0, o1); /* { o0[2], o1[2], o0[3], o1[3] } */ \
1422 tmp3 = _mm_unpackhi_ps(o2, o3); /* { o2[2], o3[2], o2[3], o3[3] } */ \
1423 o0 = _mm_movelh_ps(tmp0, tmp2); /* column 0 of the loaded rows */ \
1424 o1 = _mm_movehl_ps(tmp2, tmp0); /* column 1 */ \
1425 o2 = _mm_movelh_ps(tmp1, tmp3); /* column 2 */ \
1426 o3 = _mm_movehl_ps(tmp3, tmp1); /* column 3 */ \
1427 }
1428
1429 #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) /* saturate four int32x4 registers to bytes and store them interleaved: r0[0],r1[0],r2[0],r3[0], r0[1],r1[1],... ; r0..r3 are overwritten */ \
1430 r0 = _mm_packs_epi32( r0, r1 ); /* int16: r0[0..3], r1[0..3] */ \
1431 r2 = _mm_packs_epi32( r2, r3 ); /* int16: r2[0..3], r3[0..3] */ \
1432 r1 = _mm_unpacklo_epi16( r0, r2 ); /* r0[0],r2[0], r0[1],r2[1], r0[2],r2[2], r0[3],r2[3] */ \
1433 r3 = _mm_unpackhi_epi16( r0, r2 ); /* r1[0],r3[0], r1[1],r3[1], r1[2],r3[2], r1[3],r3[3] (indices refer to the original inputs) */ \
1434 r0 = _mm_unpacklo_epi16( r1, r3 ); /* elements 0 and 1 of all four inputs, interleaved */ \
1435 r2 = _mm_unpackhi_epi16( r1, r3 ); /* elements 2 and 3 of all four inputs, interleaved */ \
1436 r0 = _mm_packus_epi16( r0, r2 ); /* int16 -> uint8 with unsigned saturation */ \
1437 stbir__simdi_store( ptr, r0 ); /* the trailing backslash continues the macro onto the following (empty) line */ \
1438
1439 #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
1440
1441 #if defined(_MSC_VER) && !defined(__clang__)
1442 // msvc inits with 8 bytes
1443 #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255) // one 32-bit value as four char initializers, lowest byte first
1444 #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ) // 16 char initializers: v repeated in all four 32-bit lanes
1445 #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 ) // 16 char initializers: four different 32-bit lanes, v0 first
1446 #else
1447 // everything else inits with long long's
1448 #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))) // two 64-bit initializers, each holding v in both its low and high 32 bits
1449 #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2))) // two 64-bit initializers: (v1:v0) and (v3:v2), the lower-numbered value in the low 32 bits
1450 #endif
1451
1452 #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
1453 #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
1454 #define STBIR__CONSTF(var) (var)
1455 #define STBIR__CONSTI(var) (var)
1456
1457 #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41) // same SSE4.1 test that stbir_simd_floorf/stbir_simd_ceilf use, so <smmintrin.h> is included whenever they need it
1458 #include <smmintrin.h>
1459 #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))) // clamp to [0, STBIR_max_uint16_as_float], truncate, pack with unsigned saturation: out = reg0[0..3], reg1[0..3] as 16-bit words
1460 #else
1461 static const STBIR__SIMDI_CONST(stbir__s32_32768, 32768); // per-32-bit-lane bias; static const keeps it private to this translation unit like the other constants here
1462 static const STBIR__SIMDI_CONST(stbir__s16_32768, ((32768u<<16)|32768u)); // 0x8000 in every 16-bit lane; unsigned literals because 32768<<16 overflows a signed int
1463
1464 #define stbir__simdf_pack_to_8words(out,reg0,reg1) /* SSE2 has no unsigned 32->16 pack, so bias into signed range, pack signed, then undo the bias */ \
1465 { \
1466 stbir__simdi tmp0,tmp1; \
1467 tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); /* clamp to [0, STBIR_max_uint16_as_float], truncate */ \
1468 tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
1469 tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); /* shift down by 32768 before the signed pack */ \
1470 tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
1471 out = _mm_packs_epi32( tmp0, tmp1 ); /* signed saturating pack: reg0[0..3], reg1[0..3] */ \
1472 out = _mm_sub_epi16( out, stbir__s16_32768 ); /* subtracting 0x8000 wraps modulo 2^16, which restores the unsigned value */ \
1473 }
1474
1475 #endif
1476
1477 #define STBIR_SIMD
1478
1479 // if we detect AVX, set the simd8 defines
1480 #ifdef STBIR_AVX
1481 #include <immintrin.h>
1482 #define STBIR_SIMD8
1483 #define stbir__simdf8 __m256
1484 #define stbir__simdi8 __m256i
1485 #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
1486 #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
1487 #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
1488 #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
1489 #define stbir__simdi8_store( ptr, reg ) _mm256_storeu_si256( (__m256i*)(ptr), reg )
1490 #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
1491
1492 #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
1493 #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
1494
1495 #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
1496 #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1497 #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1498 #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
1499 #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
1500 #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr ) // avx load instruction
1501
1502 #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
1503 #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
1504
1505 #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
1506 #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
1507
1508 #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
1509
1510 #ifdef STBIR_AVX2
1511
1512 #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) /* AVX2: zero-extend the 16 bytes of the 128-bit ireg to 32 bits: out0 = bytes 0..7, out1 = bytes 8..15 */ \
1513 { \
1514 stbir__simdi8 a, zero =_mm256_setzero_si256();\
1515 a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); /* 256-bit unpacks work per 128-bit half, so the 64-bit permutes place bytes 0..7 / 8..15 in separate halves before, and regroup the 16-bit results after */ \
1516 out0 = _mm256_unpacklo_epi16( a, zero ); /* words 0..7 -> dwords */ \
1517 out1 = _mm256_unpackhi_epi16( a, zero ); /* words 8..15 -> dwords */ \
1518 }
1519
1520 #define stbir__simdf8_pack_to_16bytes(out,aa,bb) /* AVX2: clamp aa and bb to [0, STBIR_max_uint8_as_floatX], truncate, and pack to 16 bytes: out = aa[0..7], bb[0..7] */ \
1521 { /* NOTE: locals are named t, af, bf, a, b - arguments spelled the same way would be shadowed */ \
1522 stbir__simdi8 t; \
1523 stbir__simdf8 af,bf; \
1524 stbir__simdi8 a,b; \
1525 af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1526 bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1527 af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1528 bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1529 a = _mm256_cvttps_epi32( af ); /* float -> int32, rounding toward zero */ \
1530 b = _mm256_cvttps_epi32( bf ); \
1531 t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); /* the pack works per 128-bit half; the permute reorders to aa[0..7], bb[0..7] as int16 */ \
1532 out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); /* bytes of aa land in 64-bit lane 0 and of bb in lane 2; the permute brings them together in the low 128 bits */ \
1533 }
1534
1535 #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); // AVX2: zero-extend the eight 16-bit words of the 128-bit ireg to eight 32-bit lanes; the permute puts words 0..3 and 4..7 in separate 128-bit halves for the per-half unpack. NOTE: the expansion already ends in ';'
1536
1537 #define stbir__simdf8_pack_to_16words(out,aa,bb) /* AVX2: clamp aa and bb to [0, STBIR_max_uint16_as_floatX], truncate, and pack to 16 words: out = aa[0..7], bb[0..7] */ \
1538 { /* NOTE: locals are named af, bf, a, b - arguments spelled the same way would be shadowed */ \
1539 stbir__simdf8 af,bf; \
1540 stbir__simdi8 a,b; \
1541 af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1542 bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1543 af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1544 bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1545 a = _mm256_cvttps_epi32( af ); /* float -> int32, rounding toward zero */ \
1546 b = _mm256_cvttps_epi32( bf ); \
1547 (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); /* per-half pack yields a_lo,b_lo | a_hi,b_hi; the permute reorders to a_lo,a_hi,b_lo,b_hi */ \
1548 }
1549
1550 #else
1551
1552 #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) /* AVX without AVX2: zero-extend the 16 bytes of ireg with 128-bit integer ops: out0 = bytes 0..7, out1 = bytes 8..15 */ \
1553 { \
1554 stbir__simdi a,zero = _mm_setzero_si128(); \
1555 a = _mm_unpacklo_epi8( ireg, zero ); /* bytes 0..7 -> 16-bit words */ \
1556 out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); /* low half = bytes 0..3, high half = bytes 4..7 */ \
1557 a = _mm_unpackhi_epi8( ireg, zero ); /* bytes 8..15 -> 16-bit words */ \
1558 out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); /* low half = bytes 8..11, high half = bytes 12..15 */ \
1559 }
1560
1561 #define stbir__simdf8_pack_to_16bytes(out,aa,bb) /* AVX without AVX2: clamp aa and bb to [0, STBIR_max_uint8_as_floatX], truncate, and pack to 16 bytes: out = aa[0..7], bb[0..7] */ \
1562 { /* NOTE: locals are named t, af, bf, a, b - arguments spelled the same way would be shadowed */ \
1563 stbir__simdi t; \
1564 stbir__simdf8 af,bf; \
1565 stbir__simdi8 a,b; \
1566 af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1567 bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1568 af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1569 bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1570 a = _mm256_cvttps_epi32( af ); /* float -> int32, rounding toward zero */ \
1571 b = _mm256_cvttps_epi32( bf ); \
1572 out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); /* aa[0..7] as int16 */ \
1573 out = _mm_packus_epi16( out, out ); /* aa[0..7] as bytes, duplicated in both 64-bit halves */ \
1574 t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); /* bb[0..7] as int16 */ \
1575 t = _mm_packus_epi16( t, t ); /* bb[0..7] as bytes, duplicated */ \
1576 out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); /* low 64 bits of out followed by low 64 bits of t */ \
1577 }
1578
1579 #define stbir__simdi8_expand_u16_to_u32(out,ireg) /* AVX without AVX2: zero-extend the eight 16-bit words of ireg to eight 32-bit lanes */ \
1580 { \
1581 stbir__simdi a,b,zero = _mm_setzero_si128(); \
1582 a = _mm_unpacklo_epi16( ireg, zero ); /* words 0..3 */ \
1583 b = _mm_unpackhi_epi16( ireg, zero ); /* words 4..7 */ \
1584 out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); /* a in the low 128 bits, b in the high 128 bits */ \
1585 }
1586
1587 #define stbir__simdf8_pack_to_16words(out,aa,bb) /* AVX without AVX2: clamp aa and bb to [0, STBIR_max_uint16_as_floatX], truncate, and pack to 16 words: out = aa[0..7], bb[0..7] */ \
1588 { /* NOTE: locals are named t0, t1, af, bf, a, b - arguments spelled the same way would be shadowed */ \
1589 stbir__simdi t0,t1; \
1590 stbir__simdf8 af,bf; \
1591 stbir__simdi8 a,b; \
1592 af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1593 bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1594 af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1595 bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1596 a = _mm256_cvttps_epi32( af ); /* float -> int32, rounding toward zero */ \
1597 b = _mm256_cvttps_epi32( bf ); \
1598 t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); /* aa[0..7] as unsigned 16-bit words */ \
1599 t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); /* bb[0..7] as unsigned 16-bit words */ \
1600 out = _mm256_setr_m128i( t0, t1 ); /* t0 in the low 128 bits, t1 in the high 128 bits */ \
1601 }
1602
1603 #endif
1604
1605 static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) }; // permute control: element 0 for the low four lanes, element 1 for the high four
1606 #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 ) // _mm256_permutevar_ps selects within each 128-bit half, so the high lanes come from the high half of 'in' (presumably 'in' repeats the same four floats in both halves - confirm at call sites)
1607
1608 static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) }; // permute control: element 2 for the low four lanes, element 3 for the high four
1609 #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 ) // per-half select, as above
1610
1611 #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 ) // 128-bit result: element 2 of the low half of 'in' in all four lanes
1612
1613 #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) ) // load four floats and repeat them in both 128-bit halves
1614
1615 static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) }; // permute control: elements 0,0,1,1 for the low half and 2,2,3,3 for the high half
1616 #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 ) // per-half select, as above
1617 #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8, _mm256_castps128_ps256( b ) ) // adds the 128-bit b to the low half of a8; the cast does not define the upper half, so only the low four lanes of out are meaningful
1618
1619 static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i( 0x80000000, 0x80000000, 0, 0 ) }; // load mask: sign bit set in the first six 32-bit lanes only
1620 #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 ) // loads six floats from ptr and zeroes lanes 6 and 7; ptr is passed without a cast, so it must already be a float pointer
1621
1622 #define stbir__simdf8_0123to00000000( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
1623 #define stbir__simdf8_0123to11111111( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
1624 #define stbir__simdf8_0123to22222222( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
1625 #define stbir__simdf8_0123to33333333( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
1626 #define stbir__simdf8_0123to21032103( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
1627 #define stbir__simdf8_0123to32103210( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
1628 #define stbir__simdf8_0123to12301230( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
1629 #define stbir__simdf8_0123to10321032( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
1630 #define stbir__simdf8_0123to30123012( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
1631
1632 #define stbir__simdf8_0123to11331133( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
1633 #define stbir__simdf8_0123to00220022( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
1634
1635 #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) ) // per 128-bit half: { a3, a3, a3, 1 } - blend keeps only lane 3 of alp, shuffle then splats it. NOTE: expands to two statements
1636 #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) ) // per 128-bit half: { 1, a0, a0, a0 } - blend keeps only lane 0 of alp. NOTE: expands to two statements
1637 #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) ) // per 128-bit half: { a1, 1, a3, 1 } - blend keeps the odd lanes of alp, shuffle swaps each pair. NOTE: expands to two statements
1638 #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) ) // per 128-bit half: { 1, a0, 1, a2 } - blend keeps the even lanes of alp, shuffle swaps each pair. NOTE: expands to two statements
1639
1640 #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
1641
1642 #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
1643 #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
1644 #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
1645 #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
1646 #else
1647 #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
1648 #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
1649 #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
1650 #endif
1651 #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
1652
1653 #endif
1654
1655 #ifdef STBIR_FLOORF
1656 #undef STBIR_FLOORF
1657 #endif
1658 #define STBIR_FLOORF stbir_simd_floorf
1659 static stbir__inline float stbir_simd_floorf(float x) // martins floorf: largest integral value <= x, computed with SSE instead of the C library
1660 {
1661 #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1662 __m128 t = _mm_set_ss(x);
1663 return _mm_cvtss_f32( _mm_floor_ss(t, t) ); // SSE4.1 rounding instruction
1664 #else
1665 __m128 f = _mm_set_ss(x);
1666 __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); // truncate toward zero through int32
1667 __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f))); // subtract 1 when truncation rounded up (negative non-integers)
1668 return ( ( x > -8388608.0f ) && ( x < 8388608.0f ) ) ? _mm_cvtss_f32(r) : x; // |x| >= 2^23 is already integral; NaN, infinities and values beyond int32 range would otherwise come back as -2147483648
1669 #endif
1670 }
1671
1672 #ifdef STBIR_CEILF
1673 #undef STBIR_CEILF
1674 #endif
1675 #define STBIR_CEILF stbir_simd_ceilf
1676 static stbir__inline float stbir_simd_ceilf(float x) // martins ceilf: smallest integral value >= x, computed with SSE instead of the C library
1677 {
1678 #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1679 __m128 t = _mm_set_ss(x);
1680 return _mm_cvtss_f32( _mm_ceil_ss(t, t) ); // SSE4.1 rounding instruction
1681 #else
1682 __m128 f = _mm_set_ss(x);
1683 __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); // truncate toward zero through int32
1684 __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f))); // add 1 when truncation rounded down (positive non-integers)
1685 return ( ( x > -8388608.0f ) && ( x < 8388608.0f ) ) ? _mm_cvtss_f32(r) : x; // |x| >= 2^23 is already integral; NaN, infinities and values beyond int32 range would otherwise come back as -2147483648
1686 #endif
1687 }
1688
1689#elif defined(STBIR_NEON)
1690
1691 #include <arm_neon.h>
1692
1693 #define stbir__simdf float32x4_t
1694 #define stbir__simdi uint32x4_t
1695
1696 #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
1697 #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
1698
1699 #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
1700 #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
1701 #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1702 #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
1703 #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
1704 #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
1705 #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
1706 #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
1707 #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values must be zero
1708 #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
1709
1710 #define stbir__simdf_zeroP() vdupq_n_f32(0)
1711 #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
1712
1713 #define stbir__simdf_store( ptr, reg ) vst1q_f32( (float*)(ptr), reg )
1714 #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
1715 #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
1716 #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
1717
1718 #define stbir__simdi_store( ptr, reg ) vst1q_u32( (uint32_t*)(ptr), reg )
1719 #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
1720 #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
1721
1722 #define stbir__prefetch( ptr )
1723
1724 #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) /* NEON: zero-extend the 16 bytes of ireg to 32 bits: out0 = bytes 0..3, out1 = 4..7, out2 = 8..11, out3 = 12..15 */ \
1725 { /* NOTE: locals are named l and h - arguments spelled the same way would be shadowed */ \
1726 uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); /* bytes 0..7 -> u16 */ \
1727 uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); /* bytes 8..15 -> u16 */ \
1728 out0 = vmovl_u16( vget_low_u16 ( l ) ); \
1729 out1 = vmovl_u16( vget_high_u16( l ) ); \
1730 out2 = vmovl_u16( vget_low_u16 ( h ) ); \
1731 out3 = vmovl_u16( vget_high_u16( h ) ); \
1732 }
1733
1734 #define stbir__simdi_expand_u8_to_1u32(out,ireg) /* NEON: zero-extend only the first four bytes of ireg to four 32-bit lanes */ \
1735 { \
1736 uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
1737 out = vmovl_u16( vget_low_u16( tmp ) ); \
1738 }
1739
1740 #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) /* NEON: zero-extend the eight 16-bit words of ireg: out0 = words 0..3, out1 = words 4..7 */ \
1741 { \
1742 uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
1743 out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
1744 out1 = vmovl_u16( vget_high_u16( tmp ) ); \
1745 }
1746
1747 #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
1748 #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
1749 #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
1750 #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
1751 #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
1752 #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
1753 #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1754 #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1755 #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1756 #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1757 #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1758 #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1759
1760 #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
1761 #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1762 #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1763 #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
1764 #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
1765 #else
1766 #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1767 #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1768 #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
1769 #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
1770 #endif
1771
1772 #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1773 #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1774
1775 #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1776 #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1777
1778 #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1779 #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1780 #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1781 #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1782
1783 #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
1784 #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
1785
1786 #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
1787 #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
1788
1789 #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1790
1791 #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
1792 #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
1793
1794 #if defined( _MSC_VER ) && !defined(__clang__) // MSVC branch builds the vectors with vcreate/vcombine and a helper function; the other branch uses compound literals
1795 #define stbir_make16(a,b,c,d) vcombine_u8( /* byte-index table for the vqtbl lookups in stbir__simdf_swiz: lane selector n becomes bytes 4n..4n+3; a..d are pasted unparenthesized, so pass plain constants */ \
1796 vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1797 ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
1798 vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
1799 ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
1800
1801 static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb) // bundles two float vectors, reinterpreted as bytes, into the two-register table argument of vqtbl2q_u8
1802 {
1803 uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
1804 return r;
1805 }
1806 #else
1807 #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3} // same byte-index table as a vector compound literal
1808 #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}} // same two-register table as a compound literal
1809 #endif
1810
1811 #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
1812 #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
1813
1814 #define stbir__simdi_16madd( out, reg0, reg1 ) /* AArch64: multiply signed 16-bit lanes and add adjacent products: out[i] = reg0[2i]*reg1[2i] + reg0[2i+1]*reg1[2i+1] (same result layout as the SSE2 version, which uses _mm_madd_epi16) */ \
1815 { /* NOTE: locals are named r0, r1, tmp0, tmp1 - arguments spelled the same way would be shadowed */ \
1816 int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1817 int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1818 int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); /* widened products of lanes 0..3 */ \
1819 int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); /* widened products of lanes 4..7 */ \
1820 (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); /* pairwise add: p0+p1, p2+p3, p4+p5, p6+p7 */ \
1821 }
1822
1823 #else
1824
1825 #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
1826 #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
1827
1828 #if defined( _MSC_VER ) && !defined(__clang__)
1829 static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
1830 {
1831 uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
1832 return r;
1833 }
1834 #define stbir_make8(a,b) vcreate_u8( \
1835 (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1836 ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
1837 #else
1838 #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
1839 #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
1840 #endif
1841
1842 #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
1843 vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
1844 vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
1845
1846 #define stbir__simdi_16madd( out, reg0, reg1 ) /* 32-bit ARM: multiply signed 16-bit lanes and add adjacent products: out[i] = reg0[2i]*reg1[2i] + reg0[2i+1]*reg1[2i+1] */ \
1847 { /* NOTE: locals are named r0, r1, tmp0, tmp1, out0, out1 - arguments spelled the same way would be shadowed */ \
1848 int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1849 int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1850 int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); /* widened products of lanes 0..3 */ \
1851 int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); /* widened products of lanes 4..7 */ \
1852 int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); /* p0+p1, p2+p3 using the 64-bit pairwise add */ \
1853 int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); /* p4+p5, p6+p7 */ \
1854 (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
1855 }
1856
1857 #endif
1858
1859 #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
1860 #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
1861
1862 #define stbir__simdf_pack_to_8bytes(out,aa,bb) /* NEON: clamp aa and bb to [0, STBIR_max_uint8_as_float], truncate, and pack to bytes: out = aa0..3, bb0..3, then the same 8 bytes again */ \
1863 { /* NOTE: locals are named af, bf, ai, bi, out8 - arguments spelled the same way would be shadowed */ \
1864 float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1865 float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1866 int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); /* float -> int32 (toward zero), then saturating narrow to int16 */ \
1867 int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
1868 uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); /* saturating narrow to unsigned bytes */ \
1869 out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); /* duplicate the 8 bytes into both halves */ \
1870 }
1871
1872 #define stbir__simdf_pack_to_8words(out,aa,bb) /* NEON: clamp aa and bb to [0, STBIR_max_uint16_as_float], truncate, and pack to 16-bit words: out = aa0..3, bb0..3 */ \
1873 { /* NOTE: locals are named af, bf, ai, bi - arguments spelled the same way would be shadowed */ \
1874 float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1875 float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1876 int32x4_t ai = vcvtq_s32_f32( af ); /* float -> int32, rounding toward zero */ \
1877 int32x4_t bi = vcvtq_s32_f32( bf ); \
1878 out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); /* saturating narrow to unsigned 16-bit */ \
1879 }
1880
1881 #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) /* NEON: saturate four int32x4 registers to bytes and store them interleaved: r0[0],r1[0],r2[0],r3[0], r0[1],r1[1],... */ \
1882 { /* NOTE: locals are named tmp0, tmp1, out - arguments spelled the same way would be shadowed */ \
1883 int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); /* r0 and r2 narrowed to int16 and zipped: r0[0],r2[0],r0[1],r2[1],... */ \
1884 int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); /* r1 and r3 narrowed to int16 and zipped: r1[0],r3[0],r1[1],r3[1],... */ \
1885 uint8x8x2_t out = \
1886 { { \
1887 vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), /* bytes destined for even output positions */ \
1888 vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), /* bytes destined for odd output positions */ \
1889 } }; \
1890 vst2_u8(ptr, out); /* the 2-way interleaving store alternates between the two byte vectors */ \
1891 }
1892
1893 #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) /* NEON: load 16 consecutive floats and transpose: oK = { ptr[K], ptr[K+4], ptr[K+8], ptr[K+12] } */ \
1894 { /* NOTE: the local is named tmp - an argument spelled the same way would be shadowed */ \
1895 float32x4x4_t tmp = vld4q_f32(ptr); /* 4-way de-interleaving load performs the transpose */ \
1896 o0 = tmp.val[0]; \
1897 o1 = tmp.val[1]; \
1898 o2 = tmp.val[2]; \
1899 o3 = tmp.val[3]; \
1900 }
1901
1902 #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
1903
1904 #if defined( _MSC_VER ) && !defined(__clang__) // MSVC branch: constants are plain aligned arrays that are read back through a vector-pointer cast
1905 #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x } // four copies of x; callers add any storage-class keywords in front
1906 #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
1907 #define STBIR__CONSTF(var) (*(const float32x4_t*)var) // read a STBIR__SIMDF_CONST array as a float vector
1908 #define STBIR__CONSTI(var) (*(const uint32x4_t*)var) // read a STBIR__SIMDI_CONST array as an integer vector
1909 #else
1910 #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x } // other compilers: the constant is declared as a vector with a brace initializer
1911 #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
1912 #define STBIR__CONSTF(var) (var) // already a vector, used as-is
1913 #define STBIR__CONSTI(var) (var)
1914 #endif
1915
1916 #ifdef STBIR_FLOORF
1917 #undef STBIR_FLOORF
1918 #endif
1919 #define STBIR_FLOORF stbir_simd_floorf
1920 static stbir__inline float stbir_simd_floorf(float x) // largest integral value <= x, computed with NEON instead of the C library
1921 {
1922 #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1923 return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0); // AArch64 has a round-toward-minus-infinity instruction
1924 #else
1925 float32x2_t f = vdup_n_f32(x);
1926 float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f)); // truncate toward zero through int32
1927 uint32x2_t a = vclt_f32(f, t); // all-ones where truncation rounded up (negative non-integers)
1928 uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
1929 float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b))); // add -1.0f only in those lanes
1930 return ( ( x > -8388608.0f ) && ( x < 8388608.0f ) ) ? vget_lane_f32(r, 0) : x; // |x| >= 2^23 is already integral; the int32 round trip saturates large values and turns NaN into 0
1931 #endif
1932 }
1933
1934 #ifdef STBIR_CEILF
1935 #undef STBIR_CEILF
1936 #endif
1937 #define STBIR_CEILF stbir_simd_ceilf
1938 static stbir__inline float stbir_simd_ceilf(float x) // smallest integral value >= x, computed with NEON instead of the C library
1939 {
1940 #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1941 return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0); // AArch64 has a round-toward-plus-infinity instruction
1942 #else
1943 float32x2_t f = vdup_n_f32(x);
1944 float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f)); // truncate toward zero through int32
1945 uint32x2_t a = vclt_f32(t, f); // all-ones where truncation rounded down (positive non-integers)
1946 uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
1947 float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b))); // add 1.0f only in those lanes
1948 return ( ( x > -8388608.0f ) && ( x < 8388608.0f ) ) ? vget_lane_f32(r, 0) : x; // |x| >= 2^23 is already integral; the int32 round trip saturates large values and turns NaN into 0
1949 #endif
1950 }
1951
1952 #define STBIR_SIMD
1953
1954#elif defined(STBIR_WASM)
1955
1956 #include <wasm_simd128.h>
1957
// WASM SIMD128 backend: both the float and the integer register names map to
// the same v128_t type, so the float<->int register casts expand to nothing.
1958 #define stbir__simdf v128_t
1959 #define stbir__simdi v128_t
1960
1961 #define stbir_simdi_castf( reg ) (reg)
1962 #define stbir_simdf_casti( reg ) (reg)
1963
// Loads. The "1" variants read one 32-bit value, the "2" variants read 64 bits;
// a "z" suffix uses the zero-extending load instead of the splatting one.
// load2hmerge loads 64 bits into 64-bit lane 1 of reg, keeping the other lane.
1964 #define stbir__simdf_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
1965 #define stbir__simdi_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
1966 #define stbir__simdf_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1967 #define stbir__simdi_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) )
1968 #define stbir__simdf_load1z( out, ptr ) (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
1969 #define stbir__simdf_frep4( fvar ) wasm_f32x4_splat( fvar )
1970 #define stbir__simdf_load1frep4( out, fvar ) (out) = wasm_f32x4_splat( fvar )
1971 #define stbir__simdf_load2( out, ptr ) (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1972 #define stbir__simdf_load2z( out, ptr ) (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
1973 #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
1974
// Zero constants: zeroP is an expression, zero assigns into reg.
1975 #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
1976 #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
1977
// Stores: full register, 32-bit lane 0 (store1), 64-bit lane 0 (store2),
// or 64-bit lane 1 (store2h).
1978 #define stbir__simdf_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
1979 #define stbir__simdf_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
1980 #define stbir__simdf_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
1981 #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
1982
1983 #define stbir__simdi_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
1984 #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
1985 #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
1986
// Prefetch expands to nothing on this backend.
1987 #define stbir__prefetch( ptr )
1988
// Zero-extends all 16 unsigned bytes of ireg to 32 bits in two widening steps
// (u8 -> u16 -> u32); out0..out3 each receive four consecutive source bytes.
1989 #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1990 { \
1991 v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
1992 v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
1993 out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
1994 out1 = wasm_u32x4_extend_high_u16x8( l ); \
1995 out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
1996 out3 = wasm_u32x4_extend_high_u16x8( h ); \
1997 }
1998
// Same widening, but only the lowest four bytes of ireg are produced.
1999 #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
2000 { \
2001 v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
2002 out = wasm_u32x4_extend_low_u16x8(tmp); \
2003 }
2004
// Zero-extends the eight unsigned 16-bit lanes of ireg into two u32x4 registers.
2005 #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
2006 { \
2007 out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
2008 out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
2009 }
2010
// Float <-> int conversions. The float->int forms use the saturating truncating
// conversion; the uint8/short forms first clamp to [0, max] and return lane 0.
2011 #define stbir__simdf_convert_float_to_i32( i, f ) (i) = wasm_i32x4_trunc_sat_f32x4(f)
2012 #define stbir__simdf_convert_float_to_int( f ) wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
2013 #define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0)
2014 #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
2015 #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
2016 #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
// Basic arithmetic. The *_mem forms load their second operand from memory
// (full 16-byte load, or a splatted 32-bit load for the "1" forms).
2017 #define stbir__simdf_add( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
2018 #define stbir__simdf_mult( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
2019 #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
2020 #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2021 #define stbir__simdf_add_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
2022 #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2023
// Multiply-add is written as a separate multiply followed by an add.
// madd1 expands to exactly the same four-lane expression as madd.
2024 #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2025 #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2026 #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
2027 #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
2028
// The single-lane ("1") add/mult/min/max below also operate on all four lanes here.
2029 #define stbir__simdf_add1( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
2030 #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
2031
2032 #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
2033 #define stbir__simdf_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
2034
2035 #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2036 #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2037 #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2038 #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2039
// Two-register shuffles. Going by the macro names, indices 0-3 select lanes of
// reg0 ("0123") and 4-7 lanes of reg1 ("ABCD"); the "x" output lane is a
// don't-care and is given index -1.
// NOTE(review): confirm that the toolchain's wasm_i32x4_shuffle accepts -1.
2040 #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
2041 #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
2042
// Shuffles that combine lanes of alp with lane 0 of ones (index 4); the macro
// name spells the output lane pattern, "a" for an alp lane and "1" for ones.
2043 #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
2044 #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
2045 #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
2046 #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
2047
// Generic single-register lane permutation.
2048 #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
2049
// Integer bitwise ops and the pairwise 16-bit multiply-add (dot product).
2050 #define stbir__simdi_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
2051 #define stbir__simdi_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
2052 #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
2053
// Clamps two float registers to [0, STBIR_max_uint8_as_float], truncates to
// int, and narrows to bytes. out16 is passed as both narrow inputs, so the
// same 8 result bytes appear in both halves of out.
2054 #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
2055 { \
2056 v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2057 v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2058 v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2059 v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2060 v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
2061 out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
2062 }
2063
// Clamps two float registers to [0, STBIR_max_uint16_as_float], truncates to
// int, and narrows the eight values to unsigned 16-bit lanes.
2064 #define stbir__simdf_pack_to_8words(out,aa,bb) \
2065 { \
2066 v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2067 v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2068 v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2069 v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2070 out = wasm_u16x8_narrow_i32x4( ai, bi ); \
2071 }
2072
// Narrows four i32x4 registers to 16 bytes, then permutes the bytes so that
// lane k of r0, r1, r2 and r3 end up adjacent (r0[0] r1[0] r2[0] r3[0] r0[1] ...),
// and stores the 16 bytes at ptr.
2073 #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
2074 { \
2075 v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
2076 v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
2077 v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
2078 tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
2079 wasm_v128_store( (void*)(ptr), tmp); \
2080 }
2081
// Loads four consecutive 4-element rows starting at ptr and transposes the
// 4x4 block: o0 gets element 0 of every row, o1 element 1, and so on.
// ptr is used unparenthesized with +4/+8/+12, so pass a plain pointer expression.
2082 #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
2083 { \
2084 v128_t t0 = wasm_v128_load( ptr ); \
2085 v128_t t1 = wasm_v128_load( ptr+4 ); \
2086 v128_t t2 = wasm_v128_load( ptr+8 ); \
2087 v128_t t3 = wasm_v128_load( ptr+12 ); \
2088 v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
2089 v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
2090 v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
2091 v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
2092 o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
2093 o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
2094 o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
2095 o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
2096 }
2097
// Logical (unsigned) right shift of each 32-bit lane.
2098 #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
2099
// Constant helpers: a 16-byte-aligned float vector type used to brace-initialize
// float constants, plus declaration macros for splatted float/int constants.
// STBIR__CONSTF / STBIR__CONSTI are plain pass-throughs on this backend.
2100 typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
2101 #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
2102 #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
2103 #define STBIR__CONSTF(var) (var)
2104 #define STBIR__CONSTI(var) (var)
2105
2106 #ifdef STBIR_FLOORF
2107 #undef STBIR_FLOORF
2108 #endif
2109 #define STBIR_FLOORF stbir_simd_floorf
2110 static stbir__inline float stbir_simd_floorf(float x)
2111 {
2112 return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
2113 }
2114
2115 #ifdef STBIR_CEILF
2116 #undef STBIR_CEILF
2117 #endif
2118 #define STBIR_CEILF stbir_simd_ceilf
2119 static stbir__inline float stbir_simd_ceilf(float x)
2120 {
2121 return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
2122 }
2123
2124 #define STBIR_SIMD
2125
2126#endif // SSE2/NEON/WASM
2127
2128#endif // NO SIMD
2129
// Width-agnostic "X" aliases: code written against stbir__simdfX_* compiles to
// the 8-float (stbir__simdf8_*) operations when STBIR_SIMD8 is defined and to
// the 4-float (stbir__simdf_*) operations otherwise. stbir__simdfX_float_count
// gives the number of floats per register for the selected width.
2130#ifdef STBIR_SIMD8
2131 #define stbir__simdfX stbir__simdf8
2132 #define stbir__simdiX stbir__simdi8
2133 #define stbir__simdfX_load stbir__simdf8_load
2134 #define stbir__simdiX_load stbir__simdi8_load
2135 #define stbir__simdfX_mult stbir__simdf8_mult
2136 #define stbir__simdfX_add_mem stbir__simdf8_add_mem
2137 #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
2138 #define stbir__simdfX_store stbir__simdf8_store
2139 #define stbir__simdiX_store stbir__simdi8_store
2140 #define stbir__simdf_frepX stbir__simdf8_frep8
2141 #define stbir__simdfX_madd stbir__simdf8_madd
2142 #define stbir__simdfX_min stbir__simdf8_min
2143 #define stbir__simdfX_max stbir__simdf8_max
2144 #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
2145 #define stbir__simdfX_1aaa stbir__simdf8_1aaa
2146 #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
2147 #define stbir__simdfX_1a1a stbir__simdf8_1a1a
2148 #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
2149 #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
2150 #define stbir__simdfX_zero stbir__simdf8_zero
2151 #define STBIR_onesX STBIR_ones8
2152 #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
2153 #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
2154 #define STBIR_simd_point5X STBIR_simd_point58
2155 #define stbir__simdfX_float_count 8
2156 #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
2157 #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
// 8-wide splatted constants used by the 8-float code paths.
2158 static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
2159 static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
2160 static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
2161 static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
2162 static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
2163 static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
2164#else
// 4-wide build: the X names forward to the 4-float operations, and the shared
// constants are reached through STBIR__CONSTF.
2165 #define stbir__simdfX stbir__simdf
2166 #define stbir__simdiX stbir__simdi
2167 #define stbir__simdfX_load stbir__simdf_load
2168 #define stbir__simdiX_load stbir__simdi_load
2169 #define stbir__simdfX_mult stbir__simdf_mult
2170 #define stbir__simdfX_add_mem stbir__simdf_add_mem
2171 #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
2172 #define stbir__simdfX_store stbir__simdf_store
2173 #define stbir__simdiX_store stbir__simdi_store
2174 #define stbir__simdf_frepX stbir__simdf_frep4
2175 #define stbir__simdfX_madd stbir__simdf_madd
2176 #define stbir__simdfX_min stbir__simdf_min
2177 #define stbir__simdfX_max stbir__simdf_max
2178 #define stbir__simdfX_aaa1 stbir__simdf_aaa1
2179 #define stbir__simdfX_1aaa stbir__simdf_1aaa
2180 #define stbir__simdfX_a1a1 stbir__simdf_a1a1
2181 #define stbir__simdfX_1a1a stbir__simdf_1a1a
2182 #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
2183 #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
2184 #define stbir__simdfX_zero stbir__simdf_zero
2185 #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
2186 #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
2187 #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
2188 #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
2189 #define stbir__simdfX_float_count 4
// With 4-wide registers there is nothing to narrow, so this is a pass-through.
2190 #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
2191 #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
2192 #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
2193#endif
2194
2195
// Storage type for one half-precision value. NEON builds other than 32-bit ARM
// use a native 16-bit type (__int16 on MSVC without clang, float16_t elsewhere);
// every other build wraps the raw bits in a union so they are accessed as `.u`.
2196#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
2197
2198 #if defined( _MSC_VER ) && !defined(__clang__)
2199 typedef __int16 stbir__FP16;
2200 #else
2201 typedef float16_t stbir__FP16;
2202 #endif
2203
2204#else // no NEON, or 32-bit ARM for MSVC
2205
2206 typedef union stbir__FP16
2207 {
2208 unsigned short u;
2209 } stbir__FP16;
2210
2211#endif
2212
2213#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
2214
2215 // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
2216
2217 static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2218 {
2219 static const stbir__FP32 magic = { (254 - 15) << 23 };
2220 static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
2221 stbir__FP32 o;
2222
2223 o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
2224 o.f *= magic.f; // exponent adjust
2225 if (o.f >= was_infnan.f) // make sure Inf/NaN survive
2226 o.u |= 255 << 23;
2227 o.u |= (h.u & 0x8000) << 16; // sign bit
2228 return o.f;
2229 }
2230
2231 static stbir__inline stbir__FP16 stbir__float_to_half(float val)
2232 {
2233 stbir__FP32 f32infty = { 255 << 23 };
2234 stbir__FP32 f16max = { (127 + 16) << 23 };
2235 stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
2236 unsigned int sign_mask = 0x80000000u;
2237 stbir__FP16 o = { 0 };
2238 stbir__FP32 f;
2239 unsigned int sign;
2240
2241 f.f = val;
2242 sign = f.u & sign_mask;
2243 f.u ^= sign;
2244
2245 if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2246 o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2247 else // (De)normalized number or zero
2248 {
2249 if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
2250 {
2251 // use a magic value to align our 10 mantissa bits at the bottom of
2252 // the float. as long as FP addition is round-to-nearest-even this
2253 // just works.
2254 f.f += denorm_magic.f;
2255 // and one integer subtract of the bias later, we have our final float!
2256 o.u = (unsigned short) ( f.u - denorm_magic.u );
2257 }
2258 else
2259 {
2260 unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
2261 // update exponent, rounding bias part 1
2262 f.u = f.u + ((15u - 127) << 23) + 0xfff;
2263 // rounding bias part 2
2264 f.u += mant_odd;
2265 // take the bits!
2266 o.u = (unsigned short) ( f.u >> 13 );
2267 }
2268 }
2269
2270 o.u |= sign >> 16;
2271 return o;
2272 }
2273
2274#endif
2275
2276
2277#if defined(STBIR_FP16C)
2278
2279 #include <immintrin.h>
2280
2281 static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2282 {
2283 _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
2284 }
2285
2286 static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2287 {
2288 _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
2289 }
2290
2291 static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2292 {
2293 return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
2294 }
2295
2296 static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2297 {
2298 stbir__FP16 h;
2299 h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
2300 return h;
2301 }
2302
2303#elif defined(STBIR_SSE2)
2304
2305 // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
2306 stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
2307 {
2308 static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff);
2309 static const STBIR__SIMDI_CONST(smallest_normal, 0x0400);
2310 static const STBIR__SIMDI_CONST(infinity, 0x7c00);
2311 static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
2312 static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23);
2313
2314 __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
2315 __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
2316 __m128i mnosign = STBIR__CONSTI(mask_nosign);
2317 __m128i eadjust = STBIR__CONSTI(expadjust_normal);
2318 __m128i smallest = STBIR__CONSTI(smallest_normal);
2319 __m128i infty = STBIR__CONSTI(infinity);
2320 __m128i expmant = _mm_and_si128(mnosign, h);
2321 __m128i justsign = _mm_xor_si128(h, expmant);
2322 __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2323 __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
2324 __m128i shifted = _mm_slli_epi32(expmant, 13);
2325 __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
2326 __m128i adjusted = _mm_add_epi32(eadjust, shifted);
2327 __m128i den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2328 __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
2329 __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2330 __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2331 __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2332 __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
2333 __m128i sign = _mm_slli_epi32(justsign, 16);
2334 __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2335 stbir__simdf_store( output + 0, final );
2336
2337 h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
2338 expmant = _mm_and_si128(mnosign, h);
2339 justsign = _mm_xor_si128(h, expmant);
2340 b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2341 b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
2342 shifted = _mm_slli_epi32(expmant, 13);
2343 adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
2344 adjusted = _mm_add_epi32(eadjust, shifted);
2345 den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2346 adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
2347 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2348 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2349 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2350 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
2351 sign = _mm_slli_epi32(justsign, 16);
2352 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2353 stbir__simdf_store( output + 4, final );
2354
2355 // ~38 SSE2 ops for 8 values
2356 }
2357
2358 // Fabian's round-to-nearest-even float to half
2359 // ~48 SSE2 ops for 8 output
2360 stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
2361 {
2362 static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u);
2363 static const STBIR__SIMDI_CONST(c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf
2364 static const STBIR__SIMDI_CONST(c_nanbit, 0x200);
2365 static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
2366 static const STBIR__SIMDI_CONST(c_min_normal, (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
2367 static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
2368 static const STBIR__SIMDI_CONST(c_normal_bias, 0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
2369
2370 __m128 f = _mm_loadu_ps(input);
2371 __m128 msign = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
2372 __m128 justsign = _mm_and_ps(msign, f);
2373 __m128 absf = _mm_xor_ps(f, justsign);
2374 __m128i absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2375 __m128i f16max = STBIR__CONSTI(c_f16max);
2376 __m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2377 __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2378 __m128i nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
2379 __m128i inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2380
2381 __m128i min_normal = STBIR__CONSTI(c_min_normal);
2382 __m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
2383
2384 // "result is subnormal" path
2385 __m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2386 __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2387
2388 // "result is normal" path
2389 __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2390 __m128i mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2391
2392 __m128i round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2393 __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2394 __m128i normal = _mm_srli_epi32(round2, 13); // rounded result
2395
2396 // combine the two non-specials
2397 __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2398
2399 // merge in specials as well
2400 __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2401
2402 __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2403 __m128i final2, final= _mm_or_si128(joined, sign_shift);
2404
2405 f = _mm_loadu_ps(input+4);
2406 justsign = _mm_and_ps(msign, f);
2407 absf = _mm_xor_ps(f, justsign);
2408 absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2409 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2410 b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2411 nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
2412 inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2413
2414 b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
2415
2416 // "result is subnormal" path
2417 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2418 subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2419
2420 // "result is normal" path
2421 mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2422 mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2423
2424 round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2425 round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2426 normal = _mm_srli_epi32(round2, 13); // rounded result
2427
2428 // combine the two non-specials
2429 nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2430
2431 // merge in specials as well
2432 joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2433
2434 sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2435 final2 = _mm_or_si128(joined, sign_shift);
2436 final = _mm_packs_epi32(final, final2);
2437 stbir__simdi_store( output,final );
2438 }
2439
2440#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
2441
2442 static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2443 {
2444 float16x4_t in0 = vld1_f16(input + 0);
2445 float16x4_t in1 = vld1_f16(input + 4);
2446 vst1q_f32(output + 0, vcvt_f32_f16(in0));
2447 vst1q_f32(output + 4, vcvt_f32_f16(in1));
2448 }
2449
2450 static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2451 {
2452 float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2453 float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2454 vst1_f16(output+0, out0);
2455 vst1_f16(output+4, out1);
2456 }
2457
2458 static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2459 {
2460 return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
2461 }
2462
2463 static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2464 {
2465 return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
2466 }
2467
2468#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
2469
2470 static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2471 {
2472 float16x8_t in = vld1q_f16(input);
2473 vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
2474 vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
2475 }
2476
2477 static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2478 {
2479 float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2480 float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2481 vst1q_f16(output, vcombine_f16(out0, out1));
2482 }
2483
2484 static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2485 {
2486 return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
2487 }
2488
2489 static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2490 {
2491 return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
2492 }
2493
2494#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
2495
2496 static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2497 {
2498 for (int i=0; i<8; i++)
2499 {
2500 output[i] = stbir__half_to_float(input[i]);
2501 }
2502 }
2503 static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2504 {
2505 for (int i=0; i<8; i++)
2506 {
2507 output[i] = stbir__float_to_half(input[i]);
2508 }
2509 }
2510
2511#endif
2512
2513
2514#ifdef STBIR_SIMD
2515
// Named lane permutations built on stbir__simdf_swiz. The digits after "to"
// list, for output lanes 0..3, which input lane each one is copied from
// (e.g. 0123to1230 writes input lanes 1,2,3,0 to output lanes 0,1,2,3).
2516#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
2517#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
2518#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
2519#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
2520#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
2521#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
2522#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
2523#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
2524#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
2525#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
2526#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
2527#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
2528#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
2529#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
2530#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
2531#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
2532#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
2533#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
2534#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
2535#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
2536
// Overlays one integer SIMD register with four unsigned or four signed 32-bit
// values, so individual lanes can be read or written as plain scalars.
2537typedef union stbir__simdi_u32
2538{
2539 stbir_uint32 m128i_u32[4];
2540 int m128i_i32[4];
2541 stbir__simdi m128i_i128;
2542} stbir__simdi_u32;
2543
// Table of three 0s, three -1s (all bits set), three 0s.
// NOTE(review): presumably read at a sliding offset to build lane masks —
// confirm against the users of STBIR_mask.
2544static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };
2545
// Splatted float constants shared by the SIMD code paths.
2546static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float, stbir__max_uint8_as_float);
2547static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float, stbir__max_uint16_as_float);
2548static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted);
2549static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);
2550
2551static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f);
2552static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f);
// Splatted integer constants. (127 - 13) << 23 is the float bit pattern of
// 2^-13 and 0x3f7fffff is the largest float below 1.0.
// NOTE(review): the roles of the 0xff mask and of 0x02000000 are not visible
// in this part of the file — confirm at their use sites.
2553static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23);
2554static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff);
2555static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
2556static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000);
2557
2558// Basically, in simd mode, we unroll the proper amount, and we don't want
2559// the non-simd remnant loops to be unrolled because they only run a few times.
2560// Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
// In SIMD builds the STBIR_SIMD_* hint macros forward to the real
// stream-out / no-unroll hint macros.
2561#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
2562#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
2563#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
2564#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR
2565
// Replace any earlier STBIR_MEMCPY definition with the SIMD copy routine.
2566#ifdef STBIR_MEMCPY
2567#undef STBIR_MEMCPY
2568#endif
2569#define STBIR_MEMCPY stbir_simd_memcpy
2570
2571// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
//
// Copies `bytes` bytes from src to dest; the two regions must not overlap
// (asserted below). Only the destination pointer d is advanced: the source is
// always addressed as d + ofs_to_src.
//
// Three size classes are handled:
//   - fewer than 16 bytes: plain byte loop (nothing is done for 0 bytes)
//   - fewer than 16*stbir__simdfX_float_count bytes: 16-byte register copies
//   - otherwise: four-register chunks of 16*stbir__simdfX_float_count bytes
// In both vector cases the first chunk is copied from the unaligned start, d is
// then rounded up to the next chunk-size boundary, and the last chunk is
// re-positioned to end exactly at d_end, so some bytes may be copied twice but
// nothing past the end is touched.
2572static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
2573{
2574 char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
2575 char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
 // byte distance from a destination address to the matching source address
2576 ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
2577
2578 // check overlaps
2579 STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
2580
2581 if ( bytes < (16*stbir__simdfX_float_count) )
2582 {
2583 if ( bytes < 16 )
2584 {
 // tiny copy: one byte at a time
2585 if ( bytes )
2586 {
2587 STBIR_SIMD_NO_UNROLL_LOOP_START
2588 do
2589 {
2590 STBIR_SIMD_NO_UNROLL(d);
2591 d[ 0 ] = d[ ofs_to_src ];
2592 ++d;
2593 } while ( d < d_end );
2594 }
2595 }
2596 else
2597 {
2598 stbir__simdf x;
2599 // do one unaligned to get us aligned for the stream out below
2600 stbir__simdf_load( x, ( d + ofs_to_src ) );
2601 stbir__simdf_store( d, x );
 // advance to the next 16-byte boundary (at most 16 bytes forward)
2602 d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
2603
2604 STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2605 for(;;)
2606 {
2607 STBIR_SIMD_NO_UNROLL(d);
2608
 // fewer than 16 bytes left: finished if exactly at the end, otherwise
 // back up so the final 16-byte copy ends exactly at d_end
2609 if ( d > ( d_end - 16 ) )
2610 {
2611 if ( d == d_end )
2612 return;
2613 d = d_end - 16;
2614 }
2615
2616 stbir__simdf_load( x, ( d + ofs_to_src ) );
2617 stbir__simdf_store( d, x );
2618 d += 16;
2619 }
2620 }
2621 }
2622 else
2623 {
2624 stbir__simdfX x0,x1,x2,x3;
2625
 // d is a char pointer, so the four offsets below are byte offsets: one
 // X register holds 4*stbir__simdfX_float_count bytes, and the four
 // registers together cover 16*stbir__simdfX_float_count bytes
2626 // do one unaligned to get us aligned for the stream out below
2627 stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
2628 stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
2629 stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
2630 stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
2631 stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
2632 stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
2633 stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
2634 stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
 // advance to the next chunk-size boundary (at most one chunk forward)
2635 d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
2636
2637 STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2638 for(;;)
2639 {
2640 STBIR_SIMD_NO_UNROLL(d);
2641
 // less than a full chunk left: finished if exactly at the end, otherwise
 // back up so the final chunk ends exactly at d_end
2642 if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
2643 {
2644 if ( d == d_end )
2645 return;
2646 d = d_end - (16*stbir__simdfX_float_count);
2647 }
2648
2649 stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
2650 stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
2651 stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
2652 stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
2653 stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
2654 stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
2655 stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
2656 stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
2657 d += (16*stbir__simdfX_float_count);
2658 }
2659 }
2660}
2661
2662// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
2663// a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
2664// the diff between dest and src)
2665static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
2666{
2667 char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
2668 char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
2669 ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
2670
2671 if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
2672 {
2673 char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
2674 STBIR_SIMD_NO_UNROLL_LOOP_START
2675 do
2676 {
2677 stbir__simdf x;
2678 STBIR_SIMD_NO_UNROLL(sd);
2679 stbir__simdf_load( x, sd );
2680 stbir__simdf_store( ( sd + ofs_to_dest ), x );
2681 sd += 16;
2682 } while ( sd < s_end16 );
2683
2684 if ( sd == s_end )
2685 return;
2686 }
2687
2688 do
2689 {
2690 STBIR_SIMD_NO_UNROLL(sd);
2691 *(int*)( sd + ofs_to_dest ) = *(int*) sd;
2692 sd += 4;
2693 } while ( sd < s_end );
2694}
2695
2696#else // no SSE2
2697
// In scalar mode we let unrolling happen: STBIR_SIMD_STREAMOUT_PTR only forwards to
// STBIR_STREAMOUT_PTR (which does the __restrict), and the three no-unroll markers
// expand to nothing.
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2703
2704#endif // SSE2
2705
2706
2707#ifdef STBIR_PROFILE
2708
// STBIR_PROFILE_FUNC() returns a 64-bit timestamp used by the profiling macros.
// It is only defined here when the user has not supplied one.
#ifndef STBIR_PROFILE_FUNC

// x86 / x64: read the time stamp counter
#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )

#ifdef _MSC_VER

  // MSVC: declare and call __rdtsc
  STBIRDEF stbir_uint64 __rdtsc();
  #define STBIR_PROFILE_FUNC() __rdtsc()

#else // non msvc

  // other compilers: issue rdtsc through inline asm and join the two 32-bit halves
  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
  {
    stbir_uint32 lo, hi;
    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
  }

#endif // msvc

// ARM: read the cntvct counter register
#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)

#if defined( _MSC_VER ) && !defined(__clang__)

  #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)

#else

  // read cntvct_el0 through inline asm
  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
  {
    stbir_uint64 tsc;
    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
    return tsc;
  }

#endif

#else // x64, arm

// no timestamp source known for this platform; define STBIR_PROFILE_FUNC yourself
#error Unknown platform for profiling.

#endif // x64, arm

#endif // STBIR_PROFILE_FUNC
2753
// Extra trailing parameter / argument (note the leading comma) that hands the
// per-split info to functions using the thread profiling macros.
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info

// Same idea for functions using the build profiling macros.
#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info

// super light-weight micro profiler
//   START_ll opens a brace scope: it records the start time, remembers the parent's
//     "excluded time" pointer and installs a fresh zero counter for this zone.
//   END_ll closes that scope: it adds (elapsed - time excluded by nested zones) to
//     profile.named.<wh>, adds the whole elapsed time to the parent's excluded
//     counter, and restores the parent's pointer. START and END must be paired.
//   FIRST_START_ll points the excluded pointer at profile.named.total, zeroes
//     profile.array, then starts the zone.
//   CLEAR_EXTRAS_ll zeroes profile.array for info[1] .. info[num-1].
#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }

// for thread data (expects split_info, and split_count for CLEAR_EXTRAS, to be in scope)
#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )

// for build data (expects profile_info to be in scope)
#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
2777
2778#else // no profile
2779
// Profiling disabled: the extra parameter/argument macros and every profiling
// macro expand to nothing.
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO

// thread data macros
#define STBIR_PROFILE_START( wh )
#define STBIR_PROFILE_END( wh )
#define STBIR_PROFILE_FIRST_START( wh )
#define STBIR_PROFILE_CLEAR_EXTRAS( )

// build data macros
#define STBIR_PROFILE_BUILD_START( wh )
#define STBIR_PROFILE_BUILD_END( wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh )
#define STBIR_PROFILE_BUILD_CLEAR( info )
2795
2796#endif // stbir_profile
2797
// STBIR_CEILF / STBIR_FLOORF: float ceil and floor, overridable by defining
// STBIR_CEILF (and STBIR_FLOORF) before including this file.
#ifndef STBIR_CEILF
#include <math.h>
// The double-based fallback is only meant for VC6. _MSC_VER has to be tested with
// defined() first: an undefined macro evaluates to 0 inside #if, which would send
// every non-Microsoft compiler down the VC6 path (and trips -Wundef).
#if defined(_MSC_VER) && ( _MSC_VER <= 1200 ) // support VC6 for Sean
#define STBIR_CEILF(x) ((float)ceil((float)(x)))
#define STBIR_FLOORF(x) ((float)floor((float)(x)))
#else
#define STBIR_CEILF(x) ceilf(x)
#define STBIR_FLOORF(x) floorf(x)
#endif
#endif
2808
#ifndef STBIR_MEMCPY
// No copy routine has been selected by this point, so use the C library memcpy
// (string.h is included for its declaration).
#include <string.h>
#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
#endif
2814
2815#ifndef STBIR_SIMD
2816
2817// memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be
2818// a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
2819// the diff between dest and src)
2820static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
2821{
2822 char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
2823 char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
2824 ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
2825
2826 if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
2827 {
2828 char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
2829 STBIR_NO_UNROLL_LOOP_START
2830 do
2831 {
2832 STBIR_NO_UNROLL(sd);
2833 *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
2834 sd += 8;
2835 } while ( sd < s_end8 );
2836
2837 if ( sd == s_end )
2838 return;
2839 }
2840
2841 STBIR_NO_UNROLL_LOOP_START
2842 do
2843 {
2844 STBIR_NO_UNROLL(sd);
2845 *(int*)( sd + ofs_to_dest ) = *(int*) sd;
2846 sd += 4;
2847 } while ( sd < s_end );
2848}
2849
2850#endif
2851
// Trapezoid kernel: weight 1 out to (0.5 - scale/2), a linear ramp down to 0 at
// (0.5 + scale/2), and 0 beyond that. Symmetric in x; scale must be <= 1.
static float stbir__filter_trapezoid(float x, float scale, void * user_data)
{
  float halfscale = scale / 2;
  float outer = 0.5f + halfscale;  // the weight reaches zero here
  float inner;
  STBIR_ASSERT(scale <= 1);
  STBIR__UNUSED(user_data);

  x = ( x < 0.0f ) ? -x : x;

  if ( x >= outer )
    return 0.0f;

  inner = 0.5f - halfscale;        // the flat top ends here
  if ( x <= inner )
    return 1.0f;

  return (outer - x) / scale;
}
2872
// Support of the trapezoid kernel: the distance at which its weight reaches zero.
static float stbir__support_trapezoid(float scale, void * user_data)
{
  STBIR__UNUSED(user_data);

  return 0.5f + scale / 2.0f;
}
2878
// Triangle kernel: 1 - |x| for |x| <= 1, otherwise 0.
static float stbir__filter_triangle(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  x = ( x < 0.0f ) ? -x : x;

  return ( x <= 1.0f ) ? ( 1.0f - x ) : 0.0f;
}
2891
// Point kernel: every sample it is asked about gets weight 1; none of the
// arguments are used.
static float stbir__filter_point(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  STBIR__UNUSED(x);

  return 1.0f;
}
2900
// Piecewise cubic kernel, symmetric in x: one polynomial for |x| < 1, another for
// 1 <= |x| < 2, and 0 from 2 outwards.
static float stbir__filter_cubic(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  x = ( x < 0.0f ) ? -x : x;

  // inner lobe
  if ( x < 1.0f )
    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;

  // outer lobe
  if ( x < 2.0f )
    return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;

  return (0.0f);
}
2915
// Catmull-Rom kernel, symmetric in x: one polynomial for |x| < 1, another for
// 1 <= |x| < 2, and 0 from 2 outwards.
static float stbir__filter_catmullrom(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  x = ( x < 0.0f ) ? -x : x;

  // inner lobe
  if ( x < 1.0f )
    return 1.0f - x*x*(2.5f - 1.5f*x);

  // outer lobe
  if ( x < 2.0f )
    return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));

  return (0.0f);
}
2930
// Mitchell kernel, symmetric in x: one polynomial for |x| < 1, another for
// 1 <= |x| < 2, and 0 from 2 outwards.
static float stbir__filter_mitchell(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  x = ( x < 0.0f ) ? -x : x;

  // inner lobe
  if ( x < 1.0f )
    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;

  // outer lobe
  if ( x < 2.0f )
    return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;

  return (0.0f);
}
2945
// Support callback that always reports a radius of 0.5, whatever the scale.
static float stbir__support_zeropoint5(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  return 0.5f;
}
2952
// Support callback that always reports a radius of 1, whatever the scale.
static float stbir__support_one(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  return 1.0f;
}
2959
// Support callback that always reports a radius of 2, whatever the scale.
static float stbir__support_two(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  return 2.0f;
}
2966
2967// This is the maximum number of input samples that can affect an output sample
2968// with the given filter from the output pixel's perspective
2969static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
2970{
2971 STBIR_ASSERT(support != 0);
2972
2973 if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
2974 return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
2975 else
2976 return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
2977}
2978
2979// this is how many coefficents per run of the filter (which is different
2980// from the filter_pixel_width depending on if we are scattering or gathering)
2981static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
2982{
2983 float scale = samp->scale_info.scale;
2984 stbir__support_callback * support = samp->filter_support;
2985
2986 switch( is_gather )
2987 {
2988 case 1:
2989 return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
2990 case 2:
2991 return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
2992 case 0:
2993 return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
2994 default:
2995 STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
2996 return 0;
2997 }
2998}
2999
3000static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
3001{
3002 if (is_gather)
3003 return samp->scale_info.output_sub_size;
3004 else
3005 return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
3006}
3007
// Table entry for STBIR_EDGE_ZERO: ignores both arguments and yields 0.
static int stbir__edge_zero_full( int n, int max )
{
  STBIR__UNUSED(max);
  STBIR__UNUSED(n);

  return 0; // NOTREACHED
}
3014
// Clamp edge mode: pins n into [0, max-1].
static int stbir__edge_clamp_full( int n, int max )
{
  if ( n < 0 )
    return 0;

  // in-range values pass through unchanged (stbir__edge_wrap never sends those here)
  return ( n >= max ) ? ( max - 1 ) : n;
}
3025
// Reflect edge mode. Left of the image, n maps to -n while that is still inside,
// else to max-1. Right of the image, n maps to 2*max-n-1 while that is still
// inside, else to 0.
static int stbir__edge_reflect_full( int n, int max )
{
  // off the left side
  if ( n < 0 )
    return ( n > -max ) ? -n : ( max - 1 );

  // off the right side
  if ( n >= max )
  {
    int twice = max * 2;
    return ( n >= twice ) ? 0 : ( twice - n - 1 );
  }

  return n; // NOTREACHED
}
3047
// Wrap edge mode: n modulo max, with negative n wrapping around from the far end.
static int stbir__edge_wrap_full( int n, int max )
{
  int rem;

  if ( n >= 0 )
    return (n % max);

  // distance past the left edge, folded into one period and measured from the right
  rem = (-n) % max;
  return ( rem != 0 ) ? ( max - rem ) : rem;
}
3062
// Signature shared by the stbir__edge_*_full handlers: maps pixel index n to an
// index for an axis that is max pixels long.
typedef int stbir__edge_wrap_func( int n, int max );

// Handler table indexed by the stbir_edge value (stbir__edge_wrap indexes it with
// its edge argument); the entry order follows the enum names in the comments.
static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
{
  stbir__edge_clamp_full, // STBIR_EDGE_CLAMP
  stbir__edge_reflect_full, // STBIR_EDGE_REFLECT
  stbir__edge_wrap_full, // STBIR_EDGE_WRAP
  stbir__edge_zero_full, // STBIR_EDGE_ZERO
};
3071
3072stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
3073{
3074 // avoid per-pixel switch
3075 if (n >= 0 && n < max)
3076 return n;
3077 return stbir__edge_wrap_slow[edge]( n, max );
3078}
3079
// A margin run that comes within this many pixels of the main run is merged into it
// instead of being kept as a separate span.
#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16

// Get information on the extents of a (gather) sampler.
//
// Scans the contributors for the lowest n0 and highest n1 that are referenced, then
// fills in scanline_extents:
//   edge_sizes[0], edge_sizes[1] - how many pixels hang over the left / right edge
//   spans[0]                     - the run of in-bounds input pixels that is read
//   spans[1]                     - an optional second run (empty by default, n1 < n0),
//                                  used when the edge-mapped margin pixels can't be
//                                  merged into spans[0]
// scanline_extents->conservative must already be filled in; it is only checked here.
static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
{
  int j, stop;
  int left_margin, right_margin;
  int min_n = 0x7fffffff, max_n = -0x7fffffff;
  int min_left = 0x7fffffff, max_left = -0x7fffffff;     // 0x7fffffff in min_left means "no left margin pixels seen"
  int min_right = 0x7fffffff, max_right = -0x7fffffff;   // same sentinel for the right margin
  stbir_edge edge = samp->edge;
  stbir__contributors* contributors = samp->contributors;
  int output_sub_size = samp->scale_info.output_sub_size;
  int input_full_size = samp->scale_info.input_full_size;
  int filter_pixel_margin = samp->filter_pixel_margin;

  STBIR_ASSERT( samp->is_gather );

  // find the lowest input pixel referenced, scanning forward from the first output pixel
  stop = output_sub_size;
  for (j = 0; j < stop; j++ )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n0 < min_n )
    {
      min_n = contributors[j].n0;
      stop = j + filter_pixel_margin; // if we find a new min, only scan another filter width
      if ( stop > output_sub_size ) stop = output_sub_size;
    }
  }

  // find the highest input pixel referenced, scanning backward from the last output pixel
  stop = 0;
  for (j = output_sub_size - 1; j >= stop; j-- )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n1 > max_n )
    {
      max_n = contributors[j].n1;
      stop = j - filter_pixel_margin; // if we find a new max, only scan another filter width
      if (stop<0) stop = 0;
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // now calculate how much into the margins we really read (and clip min_n/max_n to the input)
  left_margin = 0;
  if ( min_n < 0 )
  {
    left_margin = -min_n;
    min_n = 0;
  }

  right_margin = 0;
  if ( max_n >= input_full_size )
  {
    right_margin = max_n - input_full_size + 1;
    max_n = input_full_size - 1;
  }

  // margin pixel extents (how many pixels we hang over each edge)
  scanline_extents->edge_sizes[0] = left_margin;
  scanline_extents->edge_sizes[1] = right_margin;

  // pixels read from the input
  scanline_extents->spans[0].n0 = min_n;
  scanline_extents->spans[0].n1 = max_n;
  scanline_extents->spans[0].pixel_offset_for_input = min_n;

  // default to no other input range
  scanline_extents->spans[1].n0 = 0;
  scanline_extents->spans[1].n1 = -1;
  scanline_extents->spans[1].pixel_offset_for_input = 0;

  // don't have to do edge calc for zero clamp
  if ( edge == STBIR_EDGE_ZERO )
    return;

  // convert margin pixels to the pixels within the input (min and max)
  for( j = -left_margin ; j < 0 ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_left )
      min_left = p;
    if ( p > max_left )
      max_left = p;
  }

  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_right )
      min_right = p;
    if ( p > max_right )
      max_right = p;
  }

  // merge the left margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of main pixel region
  if ( min_left != 0x7fffffff )
  {
    if ( ( ( min_left <= min_n ) && ( ( max_left + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_left ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      left_margin = 0;  // merged, so no separate left span is needed below
    }
  }

  // merge the right margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of main pixel region
  if ( min_right != 0x7fffffff )
  {
    if ( ( ( min_right <= min_n ) && ( ( max_right + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_right ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      right_margin = 0;  // merged, so no separate right span is needed below
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // you get two ranges when you have the WRAP edge mode and you are doing just a piece of the resize
  // so you need to get a second run of pixels from the opposite side of the scanline (which you
  // wouldn't need except for WRAP)


  // if we can't merge the min_left range, add it as a second range
  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    STBIR_ASSERT( right_margin == 0 );
    // if the margin's source pixels start below the main run, move the main run to
    // spans[1] and put the new run in spans[0]
    if ( min_left < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_left;
    newspan->n0 = -left_margin;
    newspan->n1 = ( max_left - min_left ) - left_margin;
    scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin
    return;
  }

  // if we can't merge the min_right range, add it as a second range
  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    // same span swap as for the left margin above
    if ( min_right < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_right;
    // NOTE(review): if the swap above did not happen, spans[1].n1 still holds the
    // default -1 set earlier, so n0 becomes 0 here - confirm that case is intended
    // (or unreachable).
    newspan->n0 = scanline_extents->spans[1].n1 + 1;
    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
    scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin
    return;
  }
}
3248
3249static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
3250{
3251 int first, last;
3252 float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
3253 float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
3254
3255 float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
3256 float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;
3257
3258 first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
3259 last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
3260 if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
3261
3262 if ( edge == STBIR_EDGE_WRAP )
3263 {
3264 if ( first < -input_size )
3265 first = -input_size;
3266 if ( last >= (input_size*2))
3267 last = (input_size*2) - 1;
3268 }
3269
3270 *first_pixel = first;
3271 *last_pixel = last;
3272}
3273
3274static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
3275{
3276 int n, end;
3277 float inv_scale = scale_info->inv_scale;
3278 float out_shift = scale_info->pixel_shift;
3279 int input_size = scale_info->input_full_size;
3280 int numerator = scale_info->scale_numerator;
3281 int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
3282
3283 // Looping through out pixels
3284 end = num_contributors; if ( polyphase ) end = numerator;
3285 for (n = 0; n < end; n++)
3286 {
3287 int i;
3288 int last_non_zero;
3289 float out_pixel_center = (float)n + 0.5f;
3290 float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
3291
3292 int in_first_pixel, in_last_pixel;
3293
3294 stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
3295
3296 // make sure we never generate a range larger than our precalculated coeff width
3297 // this only happens in point sample mode, but it's a good safe thing to do anyway
3298 if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
3299 in_last_pixel = in_first_pixel + coefficient_width - 1;
3300
3301 last_non_zero = -1;
3302 for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
3303 {
3304 float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
3305 float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
3306
3307 // kill denormals
3308 if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
3309 {
3310 if ( i == 0 ) // if we're at the front, just eat zero contributors
3311 {
3312 STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
3313 ++in_first_pixel;
3314 i--;
3315 continue;
3316 }
3317 coeff = 0; // make sure is fully zero (should keep denormals away)
3318 }
3319 else
3320 last_non_zero = i;
3321
3322 coefficient_group[i] = coeff;
3323 }
3324
3325 in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
3326 contributors->n0 = in_first_pixel;
3327 contributors->n1 = in_last_pixel;
3328
3329 STBIR_ASSERT(contributors->n1 >= contributors->n0);
3330
3331 ++contributors;
3332 coefficient_group += coefficient_width;
3333 }
3334}
3335
3336static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
3337{
3338 if ( new_pixel <= contribs->n1 ) // before the end
3339 {
3340 if ( new_pixel < contribs->n0 ) // before the front?
3341 {
3342 if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
3343 {
3344 int j, o = contribs->n0 - new_pixel;
3345 for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
3346 coeffs[ j + o ] = coeffs[ j ];
3347 for ( j = 1 ; j < o ; j-- )
3348 coeffs[ j ] = coeffs[ 0 ];
3349 coeffs[ 0 ] = new_coeff;
3350 contribs->n0 = new_pixel;
3351 }
3352 }
3353 else
3354 {
3355 coeffs[ new_pixel - contribs->n0 ] += new_coeff;
3356 }
3357 }
3358 else
3359 {
3360 if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
3361 {
3362 int j, e = new_pixel - contribs->n0;
3363 for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
3364 coeffs[j] = 0;
3365
3366 coeffs[ e ] = new_coeff;
3367 contribs->n1 = new_pixel;
3368 }
3369 }
3370}
3371
// Computes the first and last output pixel whose centers fall inside the footprint
// of one input pixel. The footprint [center - radius, center + radius] is scaled by
// scale and shifted by -out_shift into output space, then clipped to [0, out_size-1].
// The result can be empty (first > last).
static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
{
  float in_lower = in_pixel_center - in_pixels_radius;
  float in_upper = in_pixel_center + in_pixels_radius;
  float out_lower = in_lower * scale - out_shift;
  float out_upper = in_upper * scale - out_shift;
  int lo = (int)(STBIR_FLOORF(out_lower + 0.5f));
  int hi = (int)(STBIR_FLOORF(out_upper - 0.5f));

  // clip to the output
  if ( lo < 0 )
    lo = 0;
  if ( hi >= out_size )
    hi = out_size - 1;

  *first_pixel = lo;
  *last_pixel = hi;
}
3388
3389static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
3390{
3391 int in_pixel;
3392 int i;
3393 int first_out_inited = -1;
3394 float scale = scale_info->scale;
3395 float out_shift = scale_info->pixel_shift;
3396 int out_size = scale_info->output_sub_size;
3397 int numerator = scale_info->scale_numerator;
3398 int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
3399
3400 STBIR__UNUSED(num_contributors);
3401
3402 // Loop through the input pixels
3403 for (in_pixel = start; in_pixel < end; in_pixel++)
3404 {
3405 float in_pixel_center = (float)in_pixel + 0.5f;
3406 float out_center_of_in = in_pixel_center * scale - out_shift;
3407 int out_first_pixel, out_last_pixel;
3408
3409 stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
3410
3411 if ( out_first_pixel > out_last_pixel )
3412 continue;
3413
3414 // clamp or exit if we are using polyphase filtering, and the limit is up
3415 if ( polyphase )
3416 {
3417 // when polyphase, you only have to do coeffs up to the numerator count
3418 if ( out_first_pixel == numerator )
3419 break;
3420
3421 // don't do any extra work, clamp last pixel at numerator too
3422 if ( out_last_pixel >= numerator )
3423 out_last_pixel = numerator - 1;
3424 }
3425
3426 for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
3427 {
3428 float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
3429 float x = out_pixel_center - out_center_of_in;
3430 float coeff = kernel(x, scale, user_data) * scale;
3431
3432 // kill the coeff if it's too small (avoid denormals)
3433 if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
3434 coeff = 0.0f;
3435
3436 {
3437 int out = i + out_first_pixel;
3438 float * coeffs = coefficient_group + out * coefficient_width;
3439 stbir__contributors * contribs = contributors + out;
3440
3441 // is this the first time this output pixel has been seen? Init it.
3442 if ( out > first_out_inited )
3443 {
3444 STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
3445 first_out_inited = out;
3446 contribs->n0 = in_pixel;
3447 contribs->n1 = in_pixel;
3448 coeffs[0] = coeff;
3449 }
3450 else
3451 {
3452 // insert on end (always in order)
3453 if ( coeffs[0] == 0.0f ) // if the first coefficent is zero, then zap it for this coeffs
3454 {
3455 STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
3456 contribs->n0 = in_pixel;
3457 }
3458 contribs->n1 = in_pixel;
3459 STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
3460 coeffs[in_pixel - contribs->n0] = coeff;
3461 }
3462 }
3463 }
3464 }
3465}
3466
// Type used for the sum and the rescale factor when filter weights are renormalized:
// double by default, float when STBIR_RENORMALIZE_IN_FLOAT is defined.
#ifdef STBIR_RENORMALIZE_IN_FLOAT
#define STBIR_RENORM_TYPE float
#else
#define STBIR_RENORM_TYPE double
#endif
3472
3473static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
3474{
3475 int input_size = scale_info->input_full_size;
3476 int input_last_n1 = input_size - 1;
3477 int n, end;
3478 int lowest = 0x7fffffff;
3479 int highest = -0x7fffffff;
3480 int widest = -1;
3481 int numerator = scale_info->scale_numerator;
3482 int denominator = scale_info->scale_denominator;
3483 int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
3484 float * coeffs;
3485 stbir__contributors * contribs;
3486
3487 // weight all the coeffs for each sample
3488 coeffs = coefficient_group;
3489 contribs = contributors;
3490 end = num_contributors; if ( polyphase ) end = numerator;
3491 for (n = 0; n < end; n++)
3492 {
3493 int i;
3494 STBIR_RENORM_TYPE filter_scale, total_filter = 0;
3495 int e;
3496
3497 // add all contribs
3498 e = contribs->n1 - contribs->n0;
3499 for( i = 0 ; i <= e ; i++ )
3500 {
3501 total_filter += (STBIR_RENORM_TYPE) coeffs[i];
3502 STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights
3503 }
3504
3505 // rescale
3506 if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
3507 {
3508 // all coeffs are extremely small, just zero it
3509 contribs->n1 = contribs->n0;
3510 coeffs[0] = 0.0f;
3511 }
3512 else
3513 {
3514 // if the total isn't 1.0, rescale everything
3515 if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
3516 {
3517 filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
3518
3519 // scale them all
3520 for (i = 0; i <= e; i++)
3521 coeffs[i] = (float) ( coeffs[i] * filter_scale );
3522 }
3523 }
3524 ++contribs;
3525 coeffs += coefficient_width;
3526 }
3527
3528 // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
3529 // most of the coefficients, so we copy them here
3530 if ( polyphase )
3531 {
3532 stbir__contributors * prev_contribs = contributors;
3533 stbir__contributors * cur_contribs = contributors + numerator;
3534
3535 for( n = numerator ; n < num_contributors ; n++ )
3536 {
3537 cur_contribs->n0 = prev_contribs->n0 + denominator;
3538 cur_contribs->n1 = prev_contribs->n1 + denominator;
3539 ++cur_contribs;
3540 ++prev_contribs;
3541 }
3542 stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
3543 }
3544
3545 coeffs = coefficient_group;
3546 contribs = contributors;
3547
3548 for (n = 0; n < num_contributors; n++)
3549 {
3550 int i;
3551
3552 // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
3553 if ( edge == STBIR_EDGE_ZERO )
3554 {
3555 // shrink the right side if necessary
3556 if ( contribs->n1 > input_last_n1 )
3557 contribs->n1 = input_last_n1;
3558
3559 // shrink the left side
3560 if ( contribs->n0 < 0 )
3561 {
3562 int j, left, skips = 0;
3563
3564 skips = -contribs->n0;
3565 contribs->n0 = 0;
3566
3567 // now move down the weights
3568 left = contribs->n1 - contribs->n0 + 1;
3569 if ( left > 0 )
3570 {
3571 for( j = 0 ; j < left ; j++ )
3572 coeffs[ j ] = coeffs[ j + skips ];
3573 }
3574 }
3575 }
3576 else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
3577 {
3578 // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
3579
3580 // right hand side first
3581 if ( contribs->n1 > input_last_n1 )
3582 {
3583 int start = contribs->n0;
3584 int endi = contribs->n1;
3585 contribs->n1 = input_last_n1;
3586 for( i = input_size; i <= endi; i++ )
3587 stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
3588 }
3589
3590 // now check left hand edge
3591 if ( contribs->n0 < 0 )
3592 {
3593 int save_n0;
3594 float save_n0_coeff;
3595 float * c = coeffs - ( contribs->n0 + 1 );
3596
3597 // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
3598 for( i = -1 ; i > contribs->n0 ; i-- )
3599 stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
3600 save_n0 = contribs->n0;
3601 save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
3602
3603 // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
3604 contribs->n0 = 0;
3605 for(i = 0 ; i <= contribs->n1 ; i++ )
3606 coeffs[i] = coeffs[i-save_n0];
3607
3608 // now that we have shrunk down the contribs, we insert the first one safely
3609 stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
3610 }
3611 }
3612
3613 if ( contribs->n0 <= contribs->n1 )
3614 {
3615 int diff = contribs->n1 - contribs->n0 + 1;
3616 while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
3617 --diff;
3618
3619 contribs->n1 = contribs->n0 + diff - 1;
3620
3621 if ( contribs->n0 <= contribs->n1 )
3622 {
3623 if ( contribs->n0 < lowest )
3624 lowest = contribs->n0;
3625 if ( contribs->n1 > highest )
3626 highest = contribs->n1;
3627 if ( diff > widest )
3628 widest = diff;
3629 }
3630
3631 // re-zero out unused coefficients (if any)
3632 for( i = diff ; i < coefficient_width ; i++ )
3633 coeffs[i] = 0.0f;
3634 }
3635
3636 ++contribs;
3637 coeffs += coefficient_width;
3638 }
3639 filter_info->lowest = lowest;
3640 filter_info->highest = highest;
3641 filter_info->widest = widest;
3642}
3643
3644#undef STBIR_RENORM_TYPE
3645
3646static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 )
3647{
3648 #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
3649 #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
3650 #ifdef STBIR_SIMD
3651 #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
3652 #else
3653 #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
3654 #endif
3655
3656 int row_end = row1 + 1;
3657 STBIR__UNUSED( row0 ); // only used in an assert
3658
3659 if ( coefficient_width != widest )
3660 {
3661 float * pc = coefficents;
3662 float * coeffs = coefficents;
3663 float * pc_end = coefficents + num_contributors * widest;
3664 switch( widest )
3665 {
3666 case 1:
3667 STBIR_NO_UNROLL_LOOP_START
3668 do {
3669 STBIR_MOVE_1( pc, coeffs );
3670 ++pc;
3671 coeffs += coefficient_width;
3672 } while ( pc < pc_end );
3673 break;
3674 case 2:
3675 STBIR_NO_UNROLL_LOOP_START
3676 do {
3677 STBIR_MOVE_2( pc, coeffs );
3678 pc += 2;
3679 coeffs += coefficient_width;
3680 } while ( pc < pc_end );
3681 break;
3682 case 3:
3683 STBIR_NO_UNROLL_LOOP_START
3684 do {
3685 STBIR_MOVE_2( pc, coeffs );
3686 STBIR_MOVE_1( pc+2, coeffs+2 );
3687 pc += 3;
3688 coeffs += coefficient_width;
3689 } while ( pc < pc_end );
3690 break;
3691 case 4:
3692 STBIR_NO_UNROLL_LOOP_START
3693 do {
3694 STBIR_MOVE_4( pc, coeffs );
3695 pc += 4;
3696 coeffs += coefficient_width;
3697 } while ( pc < pc_end );
3698 break;
3699 case 5:
3700 STBIR_NO_UNROLL_LOOP_START
3701 do {
3702 STBIR_MOVE_4( pc, coeffs );
3703 STBIR_MOVE_1( pc+4, coeffs+4 );
3704 pc += 5;
3705 coeffs += coefficient_width;
3706 } while ( pc < pc_end );
3707 break;
3708 case 6:
3709 STBIR_NO_UNROLL_LOOP_START
3710 do {
3711 STBIR_MOVE_4( pc, coeffs );
3712 STBIR_MOVE_2( pc+4, coeffs+4 );
3713 pc += 6;
3714 coeffs += coefficient_width;
3715 } while ( pc < pc_end );
3716 break;
3717 case 7:
3718 STBIR_NO_UNROLL_LOOP_START
3719 do {
3720 STBIR_MOVE_4( pc, coeffs );
3721 STBIR_MOVE_2( pc+4, coeffs+4 );
3722 STBIR_MOVE_1( pc+6, coeffs+6 );
3723 pc += 7;
3724 coeffs += coefficient_width;
3725 } while ( pc < pc_end );
3726 break;
3727 case 8:
3728 STBIR_NO_UNROLL_LOOP_START
3729 do {
3730 STBIR_MOVE_4( pc, coeffs );
3731 STBIR_MOVE_4( pc+4, coeffs+4 );
3732 pc += 8;
3733 coeffs += coefficient_width;
3734 } while ( pc < pc_end );
3735 break;
3736 case 9:
3737 STBIR_NO_UNROLL_LOOP_START
3738 do {
3739 STBIR_MOVE_4( pc, coeffs );
3740 STBIR_MOVE_4( pc+4, coeffs+4 );
3741 STBIR_MOVE_1( pc+8, coeffs+8 );
3742 pc += 9;
3743 coeffs += coefficient_width;
3744 } while ( pc < pc_end );
3745 break;
3746 case 10:
3747 STBIR_NO_UNROLL_LOOP_START
3748 do {
3749 STBIR_MOVE_4( pc, coeffs );
3750 STBIR_MOVE_4( pc+4, coeffs+4 );
3751 STBIR_MOVE_2( pc+8, coeffs+8 );
3752 pc += 10;
3753 coeffs += coefficient_width;
3754 } while ( pc < pc_end );
3755 break;
3756 case 11:
3757 STBIR_NO_UNROLL_LOOP_START
3758 do {
3759 STBIR_MOVE_4( pc, coeffs );
3760 STBIR_MOVE_4( pc+4, coeffs+4 );
3761 STBIR_MOVE_2( pc+8, coeffs+8 );
3762 STBIR_MOVE_1( pc+10, coeffs+10 );
3763 pc += 11;
3764 coeffs += coefficient_width;
3765 } while ( pc < pc_end );
3766 break;
3767 case 12:
3768 STBIR_NO_UNROLL_LOOP_START
3769 do {
3770 STBIR_MOVE_4( pc, coeffs );
3771 STBIR_MOVE_4( pc+4, coeffs+4 );
3772 STBIR_MOVE_4( pc+8, coeffs+8 );
3773 pc += 12;
3774 coeffs += coefficient_width;
3775 } while ( pc < pc_end );
3776 break;
3777 default:
3778 STBIR_NO_UNROLL_LOOP_START
3779 do {
3780 float * copy_end = pc + widest - 4;
3781 float * c = coeffs;
3782 do {
3783 STBIR_NO_UNROLL( pc );
3784 STBIR_MOVE_4( pc, c );
3785 pc += 4;
3786 c += 4;
3787 } while ( pc <= copy_end );
3788 copy_end += 4;
3789 STBIR_NO_UNROLL_LOOP_START
3790 while ( pc < copy_end )
3791 {
3792 STBIR_MOVE_1( pc, c );
3793 ++pc; ++c;
3794 }
3795 coeffs += coefficient_width;
3796 } while ( pc < pc_end );
3797 break;
3798 }
3799 }
3800
3801 // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal
3802 coefficents[ widest * num_contributors ] = 8888.0f;
3803
3804 // the minimum we might read for unrolled filters widths is 12. So, we need to
3805 // make sure we never read outside the decode buffer, by possibly moving
3806 // the sample area back into the scanline, and putting zeros weights first.
3807 // we start on the right edge and check until we're well past the possible
3808 // clip area (2*widest).
3809 {
3810 stbir__contributors * contribs = contributors + num_contributors - 1;
3811 float * coeffs = coefficents + widest * ( num_contributors - 1 );
3812
3813 // go until no chance of clipping (this is usually less than 8 lops)
3814 while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
3815 {
3816 // might we clip??
3817 if ( ( contribs->n0 + widest ) > row_end )
3818 {
3819 int stop_range = widest;
3820
3821 // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
3822 // of this contrib n1, instead of a fixed widest amount - so calculate this
3823 if ( widest > 12 )
3824 {
3825 int mod;
3826
3827 // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3828 mod = widest & 3;
3829 stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3830
3831 // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3832 if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3833 }
3834
3835 // now see if we still clip with the refined range
3836 if ( ( contribs->n0 + stop_range ) > row_end )
3837 {
3838 int new_n0 = row_end - stop_range;
3839 int num = contribs->n1 - contribs->n0 + 1;
3840 int backup = contribs->n0 - new_n0;
3841 float * from_co = coeffs + num - 1;
3842 float * to_co = from_co + backup;
3843
3844 STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
3845
3846 // move the coeffs over
3847 while( num )
3848 {
3849 *to_co-- = *from_co--;
3850 --num;
3851 }
3852 // zero new positions
3853 while ( to_co >= coeffs )
3854 *to_co-- = 0;
3855 // set new start point
3856 contribs->n0 = new_n0;
3857 if ( widest > 12 )
3858 {
3859 int mod;
3860
3861 // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3862 mod = widest & 3;
3863 stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3864
3865 // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3866 if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3867 }
3868 }
3869 }
3870 --contribs;
3871 coeffs -= widest;
3872 }
3873 }
3874
3875 return widest;
3876 #undef STBIR_MOVE_1
3877 #undef STBIR_MOVE_2
3878 #undef STBIR_MOVE_4
3879}
3880
3881static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
3882{
3883 int n;
3884 float scale = samp->scale_info.scale;
3885 stbir__kernel_callback * kernel = samp->filter_kernel;
3886 stbir__support_callback * support = samp->filter_support;
3887 float inv_scale = samp->scale_info.inv_scale;
3888 int input_full_size = samp->scale_info.input_full_size;
3889 int gather_num_contributors = samp->num_contributors;
3890 stbir__contributors* gather_contributors = samp->contributors;
3891 float * gather_coeffs = samp->coefficients;
3892 int gather_coefficient_width = samp->coefficient_width;
3893
3894 switch ( samp->is_gather )
3895 {
3896 case 1: // gather upsample
3897 {
3898 float out_pixels_radius = support(inv_scale,user_data) * scale;
3899
3900 stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
3901
3902 STBIR_PROFILE_BUILD_START( cleanup );
3903 stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3904 STBIR_PROFILE_BUILD_END( cleanup );
3905 }
3906 break;
3907
3908 case 0: // scatter downsample (only on vertical)
3909 case 2: // gather downsample
3910 {
3911 float in_pixels_radius = support(scale,user_data) * inv_scale;
3912 int filter_pixel_margin = samp->filter_pixel_margin;
3913 int input_end = input_full_size + filter_pixel_margin;
3914
3915 // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
3916 if ( !samp->is_gather )
3917 {
3918 // check if we are using the same gather downsample on the horizontal as this vertical,
3919 // if so, then we don't have to generate them, we can just pivot from the horizontal.
3920 if ( other_axis_for_pivot )
3921 {
3922 gather_contributors = other_axis_for_pivot->contributors;
3923 gather_coeffs = other_axis_for_pivot->coefficients;
3924 gather_coefficient_width = other_axis_for_pivot->coefficient_width;
3925 gather_num_contributors = other_axis_for_pivot->num_contributors;
3926 samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
3927 samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
3928 samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
3929 goto jump_right_to_pivot;
3930 }
3931
3932 gather_contributors = samp->gather_prescatter_contributors;
3933 gather_coeffs = samp->gather_prescatter_coefficients;
3934 gather_coefficient_width = samp->gather_prescatter_coefficient_width;
3935 gather_num_contributors = samp->gather_prescatter_num_contributors;
3936 }
3937
3938 stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
3939
3940 STBIR_PROFILE_BUILD_START( cleanup );
3941 stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3942 STBIR_PROFILE_BUILD_END( cleanup );
3943
3944 if ( !samp->is_gather )
3945 {
3946 // if this is a scatter (vertical only), then we need to pivot the coeffs
3947 stbir__contributors * scatter_contributors;
3948 int highest_set;
3949
3950 jump_right_to_pivot:
3951
3952 STBIR_PROFILE_BUILD_START( pivot );
3953
3954 highest_set = (-filter_pixel_margin) - 1;
3955 for (n = 0; n < gather_num_contributors; n++)
3956 {
3957 int k;
3958 int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
3959 int scatter_coefficient_width = samp->coefficient_width;
3960 float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
3961 float * g_coeffs = gather_coeffs;
3962 scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
3963
3964 for (k = gn0 ; k <= gn1 ; k++ )
3965 {
3966 float gc = *g_coeffs++;
3967
3968 // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
3969 // (which happens when pivoting from horizontal, which might have dummy zeros)
3970 if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
3971 {
3972 if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
3973 {
3974 {
3975 // if we are skipping over several contributors, we need to clear the skipped ones
3976 stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
3977 while ( clear_contributors < scatter_contributors )
3978 {
3979 clear_contributors->n0 = 0;
3980 clear_contributors->n1 = -1;
3981 ++clear_contributors;
3982 }
3983 }
3984 scatter_contributors->n0 = n;
3985 scatter_contributors->n1 = n;
3986 scatter_coeffs[0] = gc;
3987 highest_set = k;
3988 }
3989 else
3990 {
3991 stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
3992 }
3993 STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
3994 }
3995 ++scatter_contributors;
3996 scatter_coeffs += scatter_coefficient_width;
3997 }
3998
3999 ++gather_contributors;
4000 gather_coeffs += gather_coefficient_width;
4001 }
4002
4003 // now clear any unset contribs
4004 {
4005 stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
4006 stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
4007 while ( clear_contributors < end_contributors )
4008 {
4009 clear_contributors->n0 = 0;
4010 clear_contributors->n1 = -1;
4011 ++clear_contributors;
4012 }
4013 }
4014
4015 STBIR_PROFILE_BUILD_END( pivot );
4016 }
4017 }
4018 break;
4019 }
4020}
4021
4022
4023//========================================================================================================
4024// scanline decoders and encoders
4025
// Coder instantiation: each group below sets the coder parameter macros, defines
// STB_IMAGE_RESIZE_DO_CODERS and then includes STBIR__HEADER_FILENAME.
// NOTE(review): the same macros are re-#defined by every group with no #undef in
// between here, so the included section presumably consumes and #undefs them -
// confirm against the STB_IMAGE_RESIZE_DO_CODERS part of the header.
// NOTE(review): stbir__coder_min_num looks like the minimum channel count a
// variant is generated for - confirm.

// first group: no suffix and no swizzle macros are defined
4026#define stbir__coder_min_num 1
4027#define STB_IMAGE_RESIZE_DO_CODERS
4028#include STBIR__HEADER_FILENAME
4029
// BGRA group: decode order 2,1,0,3 and encode order 2,1,0,3
4030#define stbir__decode_suffix BGRA
4031#define stbir__decode_swizzle
4032#define stbir__decode_order0 2
4033#define stbir__decode_order1 1
4034#define stbir__decode_order2 0
4035#define stbir__decode_order3 3
4036#define stbir__encode_order0 2
4037#define stbir__encode_order1 1
4038#define stbir__encode_order2 0
4039#define stbir__encode_order3 3
4040#define stbir__coder_min_num 4
4041#define STB_IMAGE_RESIZE_DO_CODERS
4042#include STBIR__HEADER_FILENAME
4043
// ARGB group: decode order 1,2,3,0 and encode order 3,0,1,2 (the two differ here)
4044#define stbir__decode_suffix ARGB
4045#define stbir__decode_swizzle
4046#define stbir__decode_order0 1
4047#define stbir__decode_order1 2
4048#define stbir__decode_order2 3
4049#define stbir__decode_order3 0
4050#define stbir__encode_order0 3
4051#define stbir__encode_order1 0
4052#define stbir__encode_order2 1
4053#define stbir__encode_order3 2
4054#define stbir__coder_min_num 4
4055#define STB_IMAGE_RESIZE_DO_CODERS
4056#include STBIR__HEADER_FILENAME
4057
// ABGR group: decode order 3,2,1,0 and encode order 3,2,1,0
4058#define stbir__decode_suffix ABGR
4059#define stbir__decode_swizzle
4060#define stbir__decode_order0 3
4061#define stbir__decode_order1 2
4062#define stbir__decode_order2 1
4063#define stbir__decode_order3 0
4064#define stbir__encode_order0 3
4065#define stbir__encode_order1 2
4066#define stbir__encode_order2 1
4067#define stbir__encode_order3 0
4068#define stbir__coder_min_num 4
4069#define STB_IMAGE_RESIZE_DO_CODERS
4070#include STBIR__HEADER_FILENAME
4071
// AR group: decode order 1,0,3,2 and encode order 1,0,3,2, with a coder_min_num of 2
4072#define stbir__decode_suffix AR
4073#define stbir__decode_swizzle
4074#define stbir__decode_order0 1
4075#define stbir__decode_order1 0
4076#define stbir__decode_order2 3
4077#define stbir__decode_order3 2
4078#define stbir__encode_order0 1
4079#define stbir__encode_order1 0
4080#define stbir__encode_order2 3
4081#define stbir__encode_order3 2
4082#define stbir__coder_min_num 2
4083#define STB_IMAGE_RESIZE_DO_CODERS
4084#include STBIR__HEADER_FILENAME
4085
4086
4087// fancy alpha means we expand to keep both premultipied and non-premultiplied color channels
4088static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
4089{
4090 float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4091 float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7; // decode buffer aligned to end of out_buffer
4092 float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
4093
4094 // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
4095
4096 #ifdef STBIR_SIMD
4097
4098 #ifdef STBIR_SIMD8
4099 decode += 16;
4100 STBIR_NO_UNROLL_LOOP_START
4101 while ( decode <= end_decode )
4102 {
4103 stbir__simdf8 d0,d1,a0,a1,p0,p1;
4104 STBIR_NO_UNROLL(decode);
4105 stbir__simdf8_load( d0, decode-16 );
4106 stbir__simdf8_load( d1, decode-16+8 );
4107 stbir__simdf8_0123to33333333( a0, d0 );
4108 stbir__simdf8_0123to33333333( a1, d1 );
4109 stbir__simdf8_mult( p0, a0, d0 );
4110 stbir__simdf8_mult( p1, a1, d1 );
4111 stbir__simdf8_bot4s( a0, d0, p0 );
4112 stbir__simdf8_bot4s( a1, d1, p1 );
4113 stbir__simdf8_top4s( d0, d0, p0 );
4114 stbir__simdf8_top4s( d1, d1, p1 );
4115 stbir__simdf8_store ( out, a0 );
4116 stbir__simdf8_store ( out+7, d0 );
4117 stbir__simdf8_store ( out+14, a1 );
4118 stbir__simdf8_store ( out+21, d1 );
4119 decode += 16;
4120 out += 28;
4121 }
4122 decode -= 16;
4123 #else
4124 decode += 8;
4125 STBIR_NO_UNROLL_LOOP_START
4126 while ( decode <= end_decode )
4127 {
4128 stbir__simdf d0,a0,d1,a1,p0,p1;
4129 STBIR_NO_UNROLL(decode);
4130 stbir__simdf_load( d0, decode-8 );
4131 stbir__simdf_load( d1, decode-8+4 );
4132 stbir__simdf_0123to3333( a0, d0 );
4133 stbir__simdf_0123to3333( a1, d1 );
4134 stbir__simdf_mult( p0, a0, d0 );
4135 stbir__simdf_mult( p1, a1, d1 );
4136 stbir__simdf_store ( out, d0 );
4137 stbir__simdf_store ( out+4, p0 );
4138 stbir__simdf_store ( out+7, d1 );
4139 stbir__simdf_store ( out+7+4, p1 );
4140 decode += 8;
4141 out += 14;
4142 }
4143 decode -= 8;
4144 #endif
4145
4146 // might be one last odd pixel
4147 #ifdef STBIR_SIMD8
4148 STBIR_NO_UNROLL_LOOP_START
4149 while ( decode < end_decode )
4150 #else
4151 if ( decode < end_decode )
4152 #endif
4153 {
4154 stbir__simdf d,a,p;
4155 STBIR_NO_UNROLL(decode);
4156 stbir__simdf_load( d, decode );
4157 stbir__simdf_0123to3333( a, d );
4158 stbir__simdf_mult( p, a, d );
4159 stbir__simdf_store ( out, d );
4160 stbir__simdf_store ( out+4, p );
4161 decode += 4;
4162 out += 7;
4163 }
4164
4165 #else
4166
4167 while( decode < end_decode )
4168 {
4169 float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
4170 out[0] = r;
4171 out[1] = g;
4172 out[2] = b;
4173 out[3] = alpha;
4174 out[4] = r * alpha;
4175 out[5] = g * alpha;
4176 out[6] = b * alpha;
4177 out += 7;
4178 decode += 4;
4179 }
4180
4181 #endif
4182}
4183
4184static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
4185{
4186 float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4187 float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
4188 float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
4189
4190 // for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
4191
4192 #ifdef STBIR_SIMD
4193
4194 decode += 8;
4195 if ( decode <= end_decode )
4196 {
4197 STBIR_NO_UNROLL_LOOP_START
4198 do {
4199 #ifdef STBIR_SIMD8
4200 stbir__simdf8 d0,a0,p0;
4201 STBIR_NO_UNROLL(decode);
4202 stbir__simdf8_load( d0, decode-8 );
4203 stbir__simdf8_0123to11331133( p0, d0 );
4204 stbir__simdf8_0123to00220022( a0, d0 );
4205 stbir__simdf8_mult( p0, p0, a0 );
4206
4207 stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4208 stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
4209 stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4210
4211 stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
4212 stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
4213 stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
4214 #else
4215 stbir__simdf d0,a0,d1,a1,p0,p1;
4216 STBIR_NO_UNROLL(decode);
4217 stbir__simdf_load( d0, decode-8 );
4218 stbir__simdf_load( d1, decode-8+4 );
4219 stbir__simdf_0123to1133( p0, d0 );
4220 stbir__simdf_0123to1133( p1, d1 );
4221 stbir__simdf_0123to0022( a0, d0 );
4222 stbir__simdf_0123to0022( a1, d1 );
4223 stbir__simdf_mult( p0, p0, a0 );
4224 stbir__simdf_mult( p1, p1, a1 );
4225
4226 stbir__simdf_store2( out, d0 );
4227 stbir__simdf_store( out+2, p0 );
4228 stbir__simdf_store2h( out+3, d0 );
4229
4230 stbir__simdf_store2( out+6, d1 );
4231 stbir__simdf_store( out+8, p1 );
4232 stbir__simdf_store2h( out+9, d1 );
4233 #endif
4234 decode += 8;
4235 out += 12;
4236 } while ( decode <= end_decode );
4237 }
4238 decode -= 8;
4239 #endif
4240
4241 STBIR_SIMD_NO_UNROLL_LOOP_START
4242 while( decode < end_decode )
4243 {
4244 float x = decode[0], y = decode[1];
4245 STBIR_SIMD_NO_UNROLL(decode);
4246 out[0] = x;
4247 out[1] = y;
4248 out[2] = x * y;
4249 out += 3;
4250 decode += 2;
4251 }
4252}
4253
4254static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
4255{
4256 float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4257 float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
4258 float const * end_output = encode_buffer + width_times_channels;
4259
4260 // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
4261
4262 STBIR_SIMD_NO_UNROLL_LOOP_START
4263 do {
4264 float alpha = input[3];
4265#ifdef STBIR_SIMD
4266 stbir__simdf i,ia;
4267 STBIR_SIMD_NO_UNROLL(encode);
4268 if ( alpha < stbir__small_float )
4269 {
4270 stbir__simdf_load( i, input );
4271 stbir__simdf_store( encode, i );
4272 }
4273 else
4274 {
4275 stbir__simdf_load1frep4( ia, 1.0f / alpha );
4276 stbir__simdf_load( i, input+4 );
4277 stbir__simdf_mult( i, i, ia );
4278 stbir__simdf_store( encode, i );
4279 encode[3] = alpha;
4280 }
4281#else
4282 if ( alpha < stbir__small_float )
4283 {
4284 encode[0] = input[0];
4285 encode[1] = input[1];
4286 encode[2] = input[2];
4287 }
4288 else
4289 {
4290 float ialpha = 1.0f / alpha;
4291 encode[0] = input[4] * ialpha;
4292 encode[1] = input[5] * ialpha;
4293 encode[2] = input[6] * ialpha;
4294 }
4295 encode[3] = alpha;
4296#endif
4297
4298 input += 7;
4299 encode += 4;
4300 } while ( encode < end_output );
4301}
4302
4303// format: [X A Xpm][X A Xpm] etc
4304static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4305{
4306 float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4307 float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
4308 float const * end_output = encode_buffer + width_times_channels;
4309
4310 do {
4311 float alpha = input[1];
4312 encode[0] = input[0];
4313 if ( alpha >= stbir__small_float )
4314 encode[0] = input[2] / alpha;
4315 encode[1] = alpha;
4316
4317 input += 3;
4318 encode += 2;
4319 } while ( encode < end_output );
4320}
4321
4322static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
4323{
4324 float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4325 float const * end_decode = decode_buffer + width_times_channels;
4326
4327 #ifdef STBIR_SIMD
4328 {
4329 decode += 2 * stbir__simdfX_float_count;
4330 STBIR_NO_UNROLL_LOOP_START
4331 while ( decode <= end_decode )
4332 {
4333 stbir__simdfX d0,a0,d1,a1;
4334 STBIR_NO_UNROLL(decode);
4335 stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
4336 stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
4337 stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
4338 stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
4339 stbir__simdfX_mult( d0, d0, a0 );
4340 stbir__simdfX_mult( d1, d1, a1 );
4341 stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
4342 stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
4343 decode += 2 * stbir__simdfX_float_count;
4344 }
4345 decode -= 2 * stbir__simdfX_float_count;
4346
4347 // few last pixels remnants
4348 #ifdef STBIR_SIMD8
4349 STBIR_NO_UNROLL_LOOP_START
4350 while ( decode < end_decode )
4351 #else
4352 if ( decode < end_decode )
4353 #endif
4354 {
4355 stbir__simdf d,a;
4356 stbir__simdf_load( d, decode );
4357 stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
4358 stbir__simdf_mult( d, d, a );
4359 stbir__simdf_store ( decode, d );
4360 decode += 4;
4361 }
4362 }
4363
4364 #else
4365
4366 while( decode < end_decode )
4367 {
4368 float alpha = decode[3];
4369 decode[0] *= alpha;
4370 decode[1] *= alpha;
4371 decode[2] *= alpha;
4372 decode += 4;
4373 }
4374
4375 #endif
4376}
4377
4378static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
4379{
4380 float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4381 float const * end_decode = decode_buffer + width_times_channels;
4382
4383 #ifdef STBIR_SIMD
4384 decode += 2 * stbir__simdfX_float_count;
4385 STBIR_NO_UNROLL_LOOP_START
4386 while ( decode <= end_decode )
4387 {
4388 stbir__simdfX d0,a0,d1,a1;
4389 STBIR_NO_UNROLL(decode);
4390 stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
4391 stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
4392 stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
4393 stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
4394 stbir__simdfX_mult( d0, d0, a0 );
4395 stbir__simdfX_mult( d1, d1, a1 );
4396 stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
4397 stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
4398 decode += 2 * stbir__simdfX_float_count;
4399 }
4400 decode -= 2 * stbir__simdfX_float_count;
4401 #endif
4402
4403 STBIR_SIMD_NO_UNROLL_LOOP_START
4404 while( decode < end_decode )
4405 {
4406 float alpha = decode[1];
4407 STBIR_SIMD_NO_UNROLL(decode);
4408 decode[0] *= alpha;
4409 decode += 2;
4410 }
4411}
4412
4413static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
4414{
4415 float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4416 float const * end_output = encode_buffer + width_times_channels;
4417
4418 STBIR_SIMD_NO_UNROLL_LOOP_START
4419 do {
4420 float alpha = encode[3];
4421
4422#ifdef STBIR_SIMD
4423 stbir__simdf i,ia;
4424 STBIR_SIMD_NO_UNROLL(encode);
4425 if ( alpha >= stbir__small_float )
4426 {
4427 stbir__simdf_load1frep4( ia, 1.0f / alpha );
4428 stbir__simdf_load( i, encode );
4429 stbir__simdf_mult( i, i, ia );
4430 stbir__simdf_store( encode, i );
4431 encode[3] = alpha;
4432 }
4433#else
4434 if ( alpha >= stbir__small_float )
4435 {
4436 float ialpha = 1.0f / alpha;
4437 encode[0] *= ialpha;
4438 encode[1] *= ialpha;
4439 encode[2] *= ialpha;
4440 }
4441#endif
4442 encode += 4;
4443 } while ( encode < end_output );
4444}
4445
4446static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4447{
4448 float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4449 float const * end_output = encode_buffer + width_times_channels;
4450
4451 do {
4452 float alpha = encode[1];
4453 if ( alpha >= stbir__small_float )
4454 encode[0] /= alpha;
4455 encode += 2;
4456 } while ( encode < end_output );
4457}
4458
4459
4460// only used in RGB->BGR or BGR->RGB
4461static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
4462{
4463 float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
4464 float const * end_decode = decode_buffer + width_times_channels;
4465
4466#ifdef STBIR_SIMD
4467 #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
4468 end_decode -= 12;
4469 STBIR_NO_UNROLL_LOOP_START
4470 while( decode <= end_decode )
4471 {
4472 // on arm64 8 instructions, no overlapping stores
4473 stbir__simdf a,b,c,na,nb;
4474 STBIR_SIMD_NO_UNROLL(decode);
4475 stbir__simdf_load( a, decode );
4476 stbir__simdf_load( b, decode+4 );
4477 stbir__simdf_load( c, decode+8 );
4478
4479 na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
4480 b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
4481 nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
4482 c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
4483
4484 stbir__simdf_store( decode, na );
4485 stbir__simdf_store( decode+4, nb );
4486 stbir__simdf_store( decode+8, c );
4487 decode += 12;
4488 }
4489 end_decode += 12;
4490 #else
4491 end_decode -= 24;
4492 STBIR_NO_UNROLL_LOOP_START
4493 while( decode <= end_decode )
4494 {
4495 // 26 instructions on x64
4496 stbir__simdf a,b,c,d,e,f,g;
4497 float i21, i23;
4498 STBIR_SIMD_NO_UNROLL(decode);
4499 stbir__simdf_load( a, decode );
4500 stbir__simdf_load( b, decode+3 );
4501 stbir__simdf_load( c, decode+6 );
4502 stbir__simdf_load( d, decode+9 );
4503 stbir__simdf_load( e, decode+12 );
4504 stbir__simdf_load( f, decode+15 );
4505 stbir__simdf_load( g, decode+18 );
4506
4507 a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
4508 b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
4509 c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
4510 d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
4511 e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
4512 f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
4513 g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
4514
4515 // stores overlap, need to be in order,
4516 stbir__simdf_store( decode, a );
4517 i21 = decode[21];
4518 stbir__simdf_store( decode+3, b );
4519 i23 = decode[23];
4520 stbir__simdf_store( decode+6, c );
4521 stbir__simdf_store( decode+9, d );
4522 stbir__simdf_store( decode+12, e );
4523 stbir__simdf_store( decode+15, f );
4524 stbir__simdf_store( decode+18, g );
4525 decode[21] = i23;
4526 decode[23] = i21;
4527 decode += 24;
4528 }
4529 end_decode += 24;
4530 #endif
4531#else
4532 end_decode -= 12;
4533 STBIR_NO_UNROLL_LOOP_START
4534 while( decode <= end_decode )
4535 {
4536 // 16 instructions
4537 float t0,t1,t2,t3;
4538 STBIR_NO_UNROLL(decode);
4539 t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
4540 decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
4541 decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
4542 decode += 12;
4543 }
4544 end_decode += 12;
4545#endif
4546
4547 STBIR_NO_UNROLL_LOOP_START
4548 while( decode < end_decode )
4549 {
4550 float t = decode[0];
4551 STBIR_NO_UNROLL(decode);
4552 decode[0] = decode[2];
4553 decode[2] = t;
4554 decode += 3;
4555 }
4556}
4557
4558
4559
4560static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
4561{
4562 int channels = stbir_info->channels;
4563 int effective_channels = stbir_info->effective_channels;
4564 int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
4565 stbir_edge edge_horizontal = stbir_info->horizontal.edge;
4566 stbir_edge edge_vertical = stbir_info->vertical.edge;
4567 int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
4568 const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
4569 stbir__span const * spans = stbir_info->scanline_extents.spans;
4570 float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
4571
4572 // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
4573 STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
4574
4575 do
4576 {
4577 float * decode_buffer;
4578 void const * input_data;
4579 float * end_decode;
4580 int width_times_channels;
4581 int width;
4582
4583 if ( spans->n1 < spans->n0 )
4584 break;
4585
4586 width = spans->n1 + 1 - spans->n0;
4587 decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
4588 end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
4589 width_times_channels = width * channels;
4590
4591 // read directly out of input plane by default
4592 input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
4593
4594 // if we have an input callback, call it to get the input data
4595 if ( stbir_info->in_pixels_cb )
4596 {
4597 // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself
4598 input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
4599 }
4600
4601 STBIR_PROFILE_START( decode );
4602 // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
4603 stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
4604 STBIR_PROFILE_END( decode );
4605
4606 if (stbir_info->alpha_weight)
4607 {
4608 STBIR_PROFILE_START( alpha );
4609 stbir_info->alpha_weight( decode_buffer, width_times_channels );
4610 STBIR_PROFILE_END( alpha );
4611 }
4612
4613 ++spans;
4614 } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );
4615
4616 // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
4617 // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
4618 // wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
4619 if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
4620 {
4621 // this code only runs if we're in edge_wrap, and we're doing the entire scanline
4622 int e, start_x[2];
4623 int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
4624
4625 start_x[0] = -stbir_info->scanline_extents.edge_sizes[0]; // left edge start x
4626 start_x[1] = input_full_size; // right edge
4627
4628 for( e = 0; e < 2 ; e++ )
4629 {
4630 // do each margin
4631 int margin = stbir_info->scanline_extents.edge_sizes[e];
4632 if ( margin )
4633 {
4634 int x = start_x[e];
4635 float * marg = full_decode_buffer + x * effective_channels;
4636 float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
4637 STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
4638 }
4639 }
4640 }
4641}
4642
4643
4644//=================
4645// Do 1 channel horizontal routines
4646
4647#ifdef STBIR_SIMD
4648
// Single-channel horizontal gather macros, STBIR_SIMD build.
// None of these receive their data as arguments: decode, hc, output,
// horizontal_coefficients, coefficient_width and horizontal_contributors must
// already be in scope where a macro is expanded. The scalar definitions of the
// same macro names (in the #else branch) show the arithmetic in plain C.

// exactly one coefficient; the scalar form is tot = decode[0]*hc[0]
4649#define stbir__1_coeff_only() \
4650 stbir__simdf tot,c; \
4651 STBIR_SIMD_NO_UNROLL(decode); \
4652 stbir__simdf_load1( c, hc ); \
4653 stbir__simdf_mult1_mem( tot, c, decode );
4654
// exactly two coefficients; the scalar form is decode[0]*hc[0] + decode[1]*hc[1]
4655#define stbir__2_coeff_only() \
4656 stbir__simdf tot,c,d; \
4657 STBIR_SIMD_NO_UNROLL(decode); \
4658 stbir__simdf_load2z( c, hc ); \
4659 stbir__simdf_load2( d, decode ); \
4660 stbir__simdf_mult( tot, c, d ); \
4661 stbir__simdf_0123to1230( c, tot ); \
4662 stbir__simdf_add1( tot, tot, c );
4663
// exactly three coefficients; two shuffled copies of the product are added back
// onto tot (presumably bringing lanes 1 and 2 down to lane 0 - confirm against
// the stbir__simdf_0123to* definitions)
4664#define stbir__3_coeff_only() \
4665 stbir__simdf tot,c,t; \
4666 STBIR_SIMD_NO_UNROLL(decode); \
4667 stbir__simdf_load( c, hc ); \
4668 stbir__simdf_mult_mem( tot, c, decode ); \
4669 stbir__simdf_0123to1230( c, tot ); \
4670 stbir__simdf_0123to2301( t, tot ); \
4671 stbir__simdf_add1( tot, tot, c ); \
4672 stbir__simdf_add1( tot, tot, t );
4673
// store for the three *_coeff_only macros: writes one float from tot, then
// steps to the next output pixel's coefficients and contributor entry
4674#define stbir__store_output_tiny() \
4675 stbir__simdf_store1( output, tot ); \
4676 horizontal_coefficients += coefficient_width; \
4677 ++horizontal_contributors; \
4678 output += 1;
4679
// first four coefficients of a longer filter; declares the accumulator tot and
// the temporary c that the continue/remnant/store macros below keep using
4680#define stbir__4_coeff_start() \
4681 stbir__simdf tot,c; \
4682 STBIR_SIMD_NO_UNROLL(decode); \
4683 stbir__simdf_load( c, hc ); \
4684 stbir__simdf_mult_mem( tot, c, decode ); \
4685
// four more coefficients starting at index ofs, accumulated into tot
4686#define stbir__4_coeff_continue_from_4( ofs ) \
4687 STBIR_SIMD_NO_UNROLL(decode); \
4688 stbir__simdf_load( c, hc + (ofs) ); \
4689 stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4690
// one trailing coefficient at index ofs (braces keep the temporary d local)
4691#define stbir__1_coeff_remnant( ofs ) \
4692 { stbir__simdf d; \
4693 stbir__simdf_load1z( c, hc + (ofs) ); \
4694 stbir__simdf_load1( d, decode + (ofs) ); \
4695 stbir__simdf_madd( tot, tot, d, c ); }
4696
// two trailing coefficients starting at index ofs
4697#define stbir__2_coeff_remnant( ofs ) \
4698 { stbir__simdf d; \
4699 stbir__simdf_load2z( c, hc+(ofs) ); \
4700 stbir__simdf_load2( d, decode+(ofs) ); \
4701 stbir__simdf_madd( tot, tot, d, c ); }
4702
// declares and loads the mask that stbir__3_coeff_remnant ANDs onto its
// coefficients; STBIR_mask is defined elsewhere (layout not visible here)
4703#define stbir__3_coeff_setup() \
4704 stbir__simdf mask; \
4705 stbir__simdf_load( mask, STBIR_mask + 3 );
4706
// three trailing coefficients: a full load is masked with the value prepared by
// stbir__3_coeff_setup (presumably clearing the fourth coefficient - TODO
// confirm), then multiplied against a full load from decode
4707#define stbir__3_coeff_remnant( ofs ) \
4708 stbir__simdf_load( c, hc+(ofs) ); \
4709 stbir__simdf_and( c, c, mask ); \
4710 stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4711
// store for the stbir__4_coeff_* family: two shuffle+add steps fold tot before
// a single float is written (the scalar form is (tot0+tot2)+(tot1+tot3)), then
// everything advances to the next output pixel
4712#define stbir__store_output() \
4713 stbir__simdf_0123to2301( c, tot ); \
4714 stbir__simdf_add( tot, tot, c ); \
4715 stbir__simdf_0123to1230( c, tot ); \
4716 stbir__simdf_add1( tot, tot, c ); \
4717 stbir__simdf_store1( output, tot ); \
4718 horizontal_coefficients += coefficient_width; \
4719 ++horizontal_contributors; \
4720 output += 1;
4721
4722#else
4723
// Single-channel horizontal gather macros, plain C build.
// decode, hc, output, horizontal_coefficients, coefficient_width and
// horizontal_contributors are taken from the scope the macro is expanded in.

// exactly one coefficient: tot = decode[0]*hc[0]
4724#define stbir__1_coeff_only() \
4725 float tot; \
4726 tot = decode[0]*hc[0];
4727
// exactly two coefficients
4728#define stbir__2_coeff_only() \
4729 float tot; \
4730 tot = decode[0] * hc[0]; \
4731 tot += decode[1] * hc[1];
4732
// exactly three coefficients
4733#define stbir__3_coeff_only() \
4734 float tot; \
4735 tot = decode[0] * hc[0]; \
4736 tot += decode[1] * hc[1]; \
4737 tot += decode[2] * hc[2];
4738
// store for the *_coeff_only macros: write tot, then move on to the next output
// pixel (next coefficient row, next contributor entry)
4739#define stbir__store_output_tiny() \
4740 output[0] = tot; \
4741 horizontal_coefficients += coefficient_width; \
4742 ++horizontal_contributors; \
4743 output += 1;
4744
// first four coefficients of a longer filter; each product gets its own
// accumulator (tot0..tot3) which the macros below keep adding to
4745#define stbir__4_coeff_start() \
4746 float tot0,tot1,tot2,tot3; \
4747 tot0 = decode[0] * hc[0]; \
4748 tot1 = decode[1] * hc[1]; \
4749 tot2 = decode[2] * hc[2]; \
4750 tot3 = decode[3] * hc[3];
4751
// four more coefficients starting at index ofs, one per accumulator
4752#define stbir__4_coeff_continue_from_4( ofs ) \
4753 tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4754 tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4755 tot2 += decode[2+(ofs)] * hc[2+(ofs)]; \
4756 tot3 += decode[3+(ofs)] * hc[3+(ofs)];
4757
// one trailing coefficient at index ofs
4758#define stbir__1_coeff_remnant( ofs ) \
4759 tot0 += decode[0+(ofs)] * hc[0+(ofs)];
4760
// two trailing coefficients starting at index ofs (the definition ends with a
// continuation onto the blank line that follows it)
4761#define stbir__2_coeff_remnant( ofs ) \
4762 tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4763 tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4764
// three trailing coefficients starting at index ofs
4765#define stbir__3_coeff_remnant( ofs ) \
4766 tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4767 tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4768 tot2 += decode[2+(ofs)] * hc[2+(ofs)];
4769
// store for the stbir__4_coeff_* family: the four accumulators are summed in
// the fixed pairing (tot0+tot2)+(tot1+tot3), then everything advances to the
// next output pixel
4770#define stbir__store_output() \
4771 output[0] = (tot0+tot2)+(tot1+tot3); \
4772 horizontal_coefficients += coefficient_width; \
4773 ++horizontal_contributors; \
4774 output += 1;
4775
4776#endif
4777
// Set the channel count to 1 and include STBIR__HEADER_FILENAME with
// STB_IMAGE_RESIZE_DO_HORIZONTALS defined. Presumably that is this header
// itself, and the section guarded by the define expands the macros above into
// the 1-channel horizontal routines - the included section is not visible here.
4778#define STBIR__horizontal_channels 1
4779#define STB_IMAGE_RESIZE_DO_HORIZONTALS
4780#include STBIR__HEADER_FILENAME
4781
4782
4783//=================
4784// Do 2 channel horizontal routines
4785
4786#ifdef STBIR_SIMD
4787
// Two-channel horizontal gather macros, STBIR_SIMD build: the short filter
// (1, 2 or 3 coefficient) forms and their store. decode, hc, output,
// horizontal_coefficients, coefficient_width and horizontal_contributors come
// from the scope the macro is expanded in.

// exactly one coefficient; the scalar form is tota = decode[0]*hc[0],
// totb = decode[1]*hc[0]
4788#define stbir__1_coeff_only() \
4789 stbir__simdf tot,c,d; \
4790 STBIR_SIMD_NO_UNROLL(decode); \
4791 stbir__simdf_load1z( c, hc ); \
4792 stbir__simdf_0123to0011( c, c ); \
4793 stbir__simdf_load2( d, decode ); \
4794 stbir__simdf_mult( tot, d, c );
4795
// exactly two coefficients, multiplied against one full load from decode
4796#define stbir__2_coeff_only() \
4797 stbir__simdf tot,c; \
4798 STBIR_SIMD_NO_UNROLL(decode); \
4799 stbir__simdf_load2( c, hc ); \
4800 stbir__simdf_0123to0011( c, c ); \
4801 stbir__simdf_mult_mem( tot, c, decode );
4802
// exactly three coefficients: the first two as above, the third applied to a
// separate two-float load from decode+4
4803#define stbir__3_coeff_only() \
4804 stbir__simdf tot,c,cs,d; \
4805 STBIR_SIMD_NO_UNROLL(decode); \
4806 stbir__simdf_load( cs, hc ); \
4807 stbir__simdf_0123to0011( c, cs ); \
4808 stbir__simdf_mult_mem( tot, c, decode ); \
4809 stbir__simdf_0123to2222( c, cs ); \
4810 stbir__simdf_load2z( d, decode+4 ); \
4811 stbir__simdf_madd( tot, tot, d, c );
4812
// store for the *_coeff_only macros: tot is added to a shuffled copy of itself
// (presumably folding the upper two lanes onto the lower two - confirm), two
// floats are written, and everything advances to the next output pixel
4813#define stbir__store_output_tiny() \
4814 stbir__simdf_0123to2301( c, tot ); \
4815 stbir__simdf_add( tot, tot, c ); \
4816 stbir__simdf_store2( output, tot ); \
4817 horizontal_coefficients += coefficient_width; \
4818 ++horizontal_contributors; \
4819 output += 2;
4820
4821#ifdef STBIR_SIMD8
4822
// Two-channel, four-or-more coefficient macros when STBIR_SIMD8 is defined.
// A single stbir__simdf8 accumulator (tot0) is used; decode is indexed with
// (ofs)*2 because every coefficient covers two floats.

// first four coefficients; declares tot0 plus the temporaries c and cs
4823#define stbir__4_coeff_start() \
4824 stbir__simdf8 tot0,c,cs; \
4825 STBIR_SIMD_NO_UNROLL(decode); \
4826 stbir__simdf8_load4b( cs, hc ); \
4827 stbir__simdf8_0123to00112233( c, cs ); \
4828 stbir__simdf8_mult_mem( tot0, c, decode );
4829
// four more coefficients starting at index ofs
4830#define stbir__4_coeff_continue_from_4( ofs ) \
4831 STBIR_SIMD_NO_UNROLL(decode); \
4832 stbir__simdf8_load4b( cs, hc + (ofs) ); \
4833 stbir__simdf8_0123to00112233( c, cs ); \
4834 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4835
// one trailing coefficient: computed in a 4-wide temporary and then added into
// tot0 with stbir__simdf8_add4
4836#define stbir__1_coeff_remnant( ofs ) \
4837 { stbir__simdf t,d; \
4838 stbir__simdf_load1z( t, hc + (ofs) ); \
4839 stbir__simdf_load2( d, decode + (ofs) * 2 ); \
4840 stbir__simdf_0123to0011( t, t ); \
4841 stbir__simdf_mult( t, t, d ); \
4842 stbir__simdf8_add4( tot0, tot0, t ); }
4843
// two trailing coefficients, same 4-wide temporary approach
4844#define stbir__2_coeff_remnant( ofs ) \
4845 { stbir__simdf t; \
4846 stbir__simdf_load2( t, hc + (ofs) ); \
4847 stbir__simdf_0123to0011( t, t ); \
4848 stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \
4849 stbir__simdf8_add4( tot0, tot0, t ); }
4850
// three trailing coefficients: the pixel data is read with stbir__simdf8_load6z
// (presumably six floats with the remaining lanes zeroed - confirm)
4851#define stbir__3_coeff_remnant( ofs ) \
4852 { stbir__simdf8 d; \
4853 stbir__simdf8_load4b( cs, hc + (ofs) ); \
4854 stbir__simdf8_0123to00112233( c, cs ); \
4855 stbir__simdf8_load6z( d, decode+(ofs)*2 ); \
4856 stbir__simdf8_madd( tot0, tot0, c, d ); }
4857
// store: tot0 is reduced to a 4-wide value, folded once more with a shuffle and
// add, two floats are written, and everything advances to the next output
// pixel; the whole body is braced so t and d stay local
4858#define stbir__store_output() \
4859 { stbir__simdf t,d; \
4860 stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
4861 stbir__simdf_0123to2301( d, t ); \
4862 stbir__simdf_add( t, t, d ); \
4863 stbir__simdf_store2( output, t ); \
4864 horizontal_coefficients += coefficient_width; \
4865 ++horizontal_contributors; \
4866 output += 2; }
4867
4868#else
4869
// Two-channel, four-or-more coefficient macros for the 4-wide SIMD build
// (STBIR_SIMD8 not defined). Two accumulators are kept: tot0 takes the products
// read from decode+(ofs)*2 and tot1 those read four floats further on.

// first four coefficients; declares tot0, tot1 and the temporaries c and cs
4870#define stbir__4_coeff_start() \
4871 stbir__simdf tot0,tot1,c,cs; \
4872 STBIR_SIMD_NO_UNROLL(decode); \
4873 stbir__simdf_load( cs, hc ); \
4874 stbir__simdf_0123to0011( c, cs ); \
4875 stbir__simdf_mult_mem( tot0, c, decode ); \
4876 stbir__simdf_0123to2233( c, cs ); \
4877 stbir__simdf_mult_mem( tot1, c, decode+4 );
4878
// four more coefficients starting at index ofs
4879#define stbir__4_coeff_continue_from_4( ofs ) \
4880 STBIR_SIMD_NO_UNROLL(decode); \
4881 stbir__simdf_load( cs, hc + (ofs) ); \
4882 stbir__simdf_0123to0011( c, cs ); \
4883 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
4884 stbir__simdf_0123to2233( c, cs ); \
4885 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
4886
// one trailing coefficient, accumulated into tot0 only
4887#define stbir__1_coeff_remnant( ofs ) \
4888 { stbir__simdf d; \
4889 stbir__simdf_load1z( cs, hc + (ofs) ); \
4890 stbir__simdf_0123to0011( c, cs ); \
4891 stbir__simdf_load2( d, decode + (ofs) * 2 ); \
4892 stbir__simdf_madd( tot0, tot0, d, c ); }
4893
// two trailing coefficients, accumulated into tot0 only
4894#define stbir__2_coeff_remnant( ofs ) \
4895 stbir__simdf_load2( cs, hc + (ofs) ); \
4896 stbir__simdf_0123to0011( c, cs ); \
4897 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4898
// three trailing coefficients: the first two go to tot0, the third is applied
// to a two-float load and goes to tot1
4899#define stbir__3_coeff_remnant( ofs ) \
4900 { stbir__simdf d; \
4901 stbir__simdf_load( cs, hc + (ofs) ); \
4902 stbir__simdf_0123to0011( c, cs ); \
4903 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
4904 stbir__simdf_0123to2222( c, cs ); \
4905 stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 ); \
4906 stbir__simdf_madd( tot1, tot1, d, c ); }
4907
// store: tot1 is added into tot0, the result is folded with a shuffle and add,
// two floats are written, and everything advances to the next output pixel
4908#define stbir__store_output() \
4909 stbir__simdf_add( tot0, tot0, tot1 ); \
4910 stbir__simdf_0123to2301( c, tot0 ); \
4911 stbir__simdf_add( tot0, tot0, c ); \
4912 stbir__simdf_store2( output, tot0 ); \
4913 horizontal_coefficients += coefficient_width; \
4914 ++horizontal_contributors; \
4915 output += 2;
4916
4917#endif
4918
4919#else
4920
// Two-channel horizontal gather macros, plain C build.
// Pixels are two consecutive floats in decode, so coefficient k applies to
// decode[2k] (channel a) and decode[2k+1] (channel b). decode, hc, output,
// horizontal_coefficients, coefficient_width and horizontal_contributors come
// from the scope the macro is expanded in.

// exactly one coefficient
4921#define stbir__1_coeff_only() \
4922 float tota,totb,c; \
4923 c = hc[0]; \
4924 tota = decode[0]*c; \
4925 totb = decode[1]*c;
4926
// exactly two coefficients
4927#define stbir__2_coeff_only() \
4928 float tota,totb,c; \
4929 c = hc[0]; \
4930 tota = decode[0]*c; \
4931 totb = decode[1]*c; \
4932 c = hc[1]; \
4933 tota += decode[2]*c; \
4934 totb += decode[3]*c;
4935
// exactly three coefficients; note hc[2] is accumulated before hc[1]
4936// this weird order of add matches the simd
4937#define stbir__3_coeff_only() \
4938 float tota,totb,c; \
4939 c = hc[0]; \
4940 tota = decode[0]*c; \
4941 totb = decode[1]*c; \
4942 c = hc[2]; \
4943 tota += decode[4]*c; \
4944 totb += decode[5]*c; \
4945 c = hc[1]; \
4946 tota += decode[2]*c; \
4947 totb += decode[3]*c;
4948
// store for the *_coeff_only macros: write both channels, then move on to the
// next output pixel (next coefficient row, next contributor entry)
4949#define stbir__store_output_tiny() \
4950 output[0] = tota; \
4951 output[1] = totb; \
4952 horizontal_coefficients += coefficient_width; \
4953 ++horizontal_contributors; \
4954 output += 2;
4955
// first four coefficients of a longer filter; each coefficient gets its own
// accumulator per channel (tota0..tota3, totb0..totb3)
4956#define stbir__4_coeff_start() \
4957 float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c; \
4958 c = hc[0]; \
4959 tota0 = decode[0]*c; \
4960 totb0 = decode[1]*c; \
4961 c = hc[1]; \
4962 tota1 = decode[2]*c; \
4963 totb1 = decode[3]*c; \
4964 c = hc[2]; \
4965 tota2 = decode[4]*c; \
4966 totb2 = decode[5]*c; \
4967 c = hc[3]; \
4968 tota3 = decode[6]*c; \
4969 totb3 = decode[7]*c;
4970
// four more coefficients starting at index ofs (pixel data at (ofs)*2)
4971#define stbir__4_coeff_continue_from_4( ofs ) \
4972 c = hc[0+(ofs)]; \
4973 tota0 += decode[0+(ofs)*2]*c; \
4974 totb0 += decode[1+(ofs)*2]*c; \
4975 c = hc[1+(ofs)]; \
4976 tota1 += decode[2+(ofs)*2]*c; \
4977 totb1 += decode[3+(ofs)*2]*c; \
4978 c = hc[2+(ofs)]; \
4979 tota2 += decode[4+(ofs)*2]*c; \
4980 totb2 += decode[5+(ofs)*2]*c; \
4981 c = hc[3+(ofs)]; \
4982 tota3 += decode[6+(ofs)*2]*c; \
4983 totb3 += decode[7+(ofs)*2]*c;
4984
// one trailing coefficient at index ofs
4985#define stbir__1_coeff_remnant( ofs ) \
4986 c = hc[0+(ofs)]; \
4987 tota0 += decode[0+(ofs)*2] * c; \
4988 totb0 += decode[1+(ofs)*2] * c;
4989
// two trailing coefficients starting at index ofs
4990#define stbir__2_coeff_remnant( ofs ) \
4991 c = hc[0+(ofs)]; \
4992 tota0 += decode[0+(ofs)*2] * c; \
4993 totb0 += decode[1+(ofs)*2] * c; \
4994 c = hc[1+(ofs)]; \
4995 tota1 += decode[2+(ofs)*2] * c; \
4996 totb1 += decode[3+(ofs)*2] * c;
4997
// three trailing coefficients starting at index ofs
4998#define stbir__3_coeff_remnant( ofs ) \
4999 c = hc[0+(ofs)]; \
5000 tota0 += decode[0+(ofs)*2] * c; \
5001 totb0 += decode[1+(ofs)*2] * c; \
5002 c = hc[1+(ofs)]; \
5003 tota1 += decode[2+(ofs)*2] * c; \
5004 totb1 += decode[3+(ofs)*2] * c; \
5005 c = hc[2+(ofs)]; \
5006 tota2 += decode[4+(ofs)*2] * c; \
5007 totb2 += decode[5+(ofs)*2] * c;
5008
// store for the stbir__4_coeff_* family: per channel the four accumulators are
// summed in the fixed pairing (0+2)+(1+3), then everything advances to the next
// output pixel
5009#define stbir__store_output() \
5010 output[0] = (tota0+tota2)+(tota1+tota3); \
5011 output[1] = (totb0+totb2)+(totb1+totb3); \
5012 horizontal_coefficients += coefficient_width; \
5013 ++horizontal_contributors; \
5014 output += 2;
5015
5016#endif
5017
// Set the channel count to 2 and include STBIR__HEADER_FILENAME with
// STB_IMAGE_RESIZE_DO_HORIZONTALS defined. Presumably that is this header
// itself, and the section guarded by the define expands the macros above into
// the 2-channel horizontal routines - the included section is not visible here.
5018#define STBIR__horizontal_channels 2
5019#define STB_IMAGE_RESIZE_DO_HORIZONTALS
5020#include STBIR__HEADER_FILENAME
5021
5022
5023//=================
5024// Do 3 channel horizontal routines
5025
5026#ifdef STBIR_SIMD
5027
// Three-channel horizontal gather macros, STBIR_SIMD build: the short filter
// (1, 2 or 3 coefficient) forms and their store. Pixels are three floats apart
// in decode, and each pixel is fetched with a full stbir__simdf_load (so
// presumably one float beyond the pixel is read as well). decode, hc, output,
// horizontal_coefficients, coefficient_width and horizontal_contributors come
// from the scope the macro is expanded in.

// exactly one coefficient; the scalar form is tot0..tot2 = decode[0..2]*hc[0]
5028#define stbir__1_coeff_only() \
5029 stbir__simdf tot,c,d; \
5030 STBIR_SIMD_NO_UNROLL(decode); \
5031 stbir__simdf_load1z( c, hc ); \
5032 stbir__simdf_0123to0001( c, c ); \
5033 stbir__simdf_load( d, decode ); \
5034 stbir__simdf_mult( tot, d, c );
5035
// exactly two coefficients: pixel at decode weighted by the first, pixel at
// decode+3 weighted by the second
5036#define stbir__2_coeff_only() \
5037 stbir__simdf tot,c,cs,d; \
5038 STBIR_SIMD_NO_UNROLL(decode); \
5039 stbir__simdf_load2( cs, hc ); \
5040 stbir__simdf_0123to0000( c, cs ); \
5041 stbir__simdf_load( d, decode ); \
5042 stbir__simdf_mult( tot, d, c ); \
5043 stbir__simdf_0123to1111( c, cs ); \
5044 stbir__simdf_load( d, decode+3 ); \
5045 stbir__simdf_madd( tot, tot, d, c );
5046
// exactly three coefficients: pixels at decode, decode+3 and decode+6
5047#define stbir__3_coeff_only() \
5048 stbir__simdf tot,c,d,cs; \
5049 STBIR_SIMD_NO_UNROLL(decode); \
5050 stbir__simdf_load( cs, hc ); \
5051 stbir__simdf_0123to0000( c, cs ); \
5052 stbir__simdf_load( d, decode ); \
5053 stbir__simdf_mult( tot, d, c ); \
5054 stbir__simdf_0123to1111( c, cs ); \
5055 stbir__simdf_load( d, decode+3 ); \
5056 stbir__simdf_madd( tot, tot, d, c ); \
5057 stbir__simdf_0123to2222( c, cs ); \
5058 stbir__simdf_load( d, decode+6 ); \
5059 stbir__simdf_madd( tot, tot, d, c );
5060
// store for the *_coeff_only macros: the three floats are written as a
// two-float store followed by a one-float store at output+2 (tot is shuffled in
// between), then everything advances to the next output pixel
5061#define stbir__store_output_tiny() \
5062 stbir__simdf_store2( output, tot ); \
5063 stbir__simdf_0123to2301( tot, tot ); \
5064 stbir__simdf_store1( output+2, tot ); \
5065 horizontal_coefficients += coefficient_width; \
5066 ++horizontal_contributors; \
5067 output += 3;
5068
5069#ifdef STBIR_SIMD8
5070
// Three-channel, four-or-more coefficient macros when STBIR_SIMD8 is defined.
// Two stbir__simdf8 accumulators (tot0, tot1) are used. Every pixel read is
// addressed one float early (the "- 1" terms), so the float just before decode
// is touched as well. stbir__store_output additionally needs output_end in
// scope and must be expanded directly inside a loop, because it ends in
// continue/break.
5071// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
5072#define stbir__4_coeff_start() \
5073 stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t; \
5074 STBIR_SIMD_NO_UNROLL(decode); \
5075 stbir__simdf8_load4b( cs, hc ); \
5076 stbir__simdf8_0123to00001111( c, cs ); \
5077 stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
5078 stbir__simdf8_0123to22223333( c, cs ); \
5079 stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
5080
// four more coefficients starting at index ofs (pixel data at (ofs)*3)
5081#define stbir__4_coeff_continue_from_4( ofs ) \
5082 STBIR_SIMD_NO_UNROLL(decode); \
5083 stbir__simdf8_load4b( cs, hc + (ofs) ); \
5084 stbir__simdf8_0123to00001111( c, cs ); \
5085 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
5086 stbir__simdf8_0123to22223333( c, cs ); \
5087 stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
5088
// one trailing coefficient, accumulated into tot0 through the 4-wide temp t
5089#define stbir__1_coeff_remnant( ofs ) \
5090 STBIR_SIMD_NO_UNROLL(decode); \
5091 stbir__simdf_load1rep4( t, hc + (ofs) ); \
5092 stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
5093
// two trailing coefficients. The coefficient load starts at hc+(ofs)-2 and the
// 22223333 shuffle is then applied, so presumably the two wanted coefficients
// are picked from positions 2 and 3 of that load; this needs (ofs) >= 2 to stay
// inside the coefficient row - NOTE(review): confirm every expansion site
// satisfies that
5094#define stbir__2_coeff_remnant( ofs ) \
5095 STBIR_SIMD_NO_UNROLL(decode); \
5096 stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \
5097 stbir__simdf8_0123to22223333( c, cs ); \
5098 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
5099
// three trailing coefficients: two go to tot0 with a full-width multiply-add,
// the third goes to tot1 through the 4-wide temp t
5100 #define stbir__3_coeff_remnant( ofs ) \
5101 STBIR_SIMD_NO_UNROLL(decode); \
5102 stbir__simdf8_load4b( cs, hc + (ofs) ); \
5103 stbir__simdf8_0123to00001111( c, cs ); \
5104 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
5105 stbir__simdf8_0123to2222( t, cs ); \
5106 stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
5107
// store: the accumulators are reduced into t and the pointers are advanced
// first. If more output follows (output < output_end) a full stbir__simdf_store
// is issued at the pixel just finished and the enclosing loop continues -
// presumably that store also writes into the next pixel's first float, which
// is overwritten later. For the final pixel exactly three floats are written
// (two-float store plus one-float store) and the enclosing loop is left with
// break.
5108#define stbir__store_output() \
5109 stbir__simdf8_add( tot0, tot0, tot1 ); \
5110 stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
5111 stbir__simdf8_add4halves( t, t, tot0 ); \
5112 horizontal_coefficients += coefficient_width; \
5113 ++horizontal_contributors; \
5114 output += 3; \
5115 if ( output < output_end ) \
5116 { \
5117 stbir__simdf_store( output-3, t ); \
5118 continue; \
5119 } \
5120 { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
5121 stbir__simdf_store2( output-3, t ); \
5122 stbir__simdf_store1( output+2-3, tt ); } \
5123 break;
5124
5125
5126#else
5127
// Three-channel, four-or-more coefficient macros for the 4-wide SIMD build
// (STBIR_SIMD8 not defined). Four pixels are twelve floats, handled as three
// 4-wide loads at +0, +4 and +8 with accumulators tot0, tot1 and tot2; the
// coefficient shuffles 0001 / 1122 / 2333 line each coefficient up with the
// floats of its pixel. stbir__store_output additionally needs output_end in
// scope and must be expanded directly inside a loop, because it ends in
// continue/break.

// first four coefficients; declares the accumulators and temporaries
5128#define stbir__4_coeff_start() \
5129 stbir__simdf tot0,tot1,tot2,c,cs; \
5130 STBIR_SIMD_NO_UNROLL(decode); \
5131 stbir__simdf_load( cs, hc ); \
5132 stbir__simdf_0123to0001( c, cs ); \
5133 stbir__simdf_mult_mem( tot0, c, decode ); \
5134 stbir__simdf_0123to1122( c, cs ); \
5135 stbir__simdf_mult_mem( tot1, c, decode+4 ); \
5136 stbir__simdf_0123to2333( c, cs ); \
5137 stbir__simdf_mult_mem( tot2, c, decode+8 );
5138
// four more coefficients starting at index ofs (pixel data at (ofs)*3)
5139#define stbir__4_coeff_continue_from_4( ofs ) \
5140 STBIR_SIMD_NO_UNROLL(decode); \
5141 stbir__simdf_load( cs, hc + (ofs) ); \
5142 stbir__simdf_0123to0001( c, cs ); \
5143 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
5144 stbir__simdf_0123to1122( c, cs ); \
5145 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
5146 stbir__simdf_0123to2333( c, cs ); \
5147 stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
5148
// one trailing coefficient, accumulated into tot0 only
5149#define stbir__1_coeff_remnant( ofs ) \
5150 STBIR_SIMD_NO_UNROLL(decode); \
5151 stbir__simdf_load1z( c, hc + (ofs) ); \
5152 stbir__simdf_0123to0001( c, c ); \
5153 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
5154
// two trailing coefficients: six floats of pixel data, read as a full load for
// tot0 plus a two-float load (stbir__simdf_load2z) for tot1
5155#define stbir__2_coeff_remnant( ofs ) \
5156 { stbir__simdf d; \
5157 STBIR_SIMD_NO_UNROLL(decode); \
5158 stbir__simdf_load2z( cs, hc + (ofs) ); \
5159 stbir__simdf_0123to0001( c, cs ); \
5160 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
5161 stbir__simdf_0123to1122( c, cs ); \
5162 stbir__simdf_load2z( d, decode+(ofs)*3+4 ); \
5163 stbir__simdf_madd( tot1, tot1, c, d ); }
5164
// three trailing coefficients: nine floats of pixel data, read as two full
// loads for tot0/tot1 plus a single-float load (stbir__simdf_load1z) for tot2
5165#define stbir__3_coeff_remnant( ofs ) \
5166 { stbir__simdf d; \
5167 STBIR_SIMD_NO_UNROLL(decode); \
5168 stbir__simdf_load( cs, hc + (ofs) ); \
5169 stbir__simdf_0123to0001( c, cs ); \
5170 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
5171 stbir__simdf_0123to1122( c, cs ); \
5172 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
5173 stbir__simdf_0123to2222( c, cs ); \
5174 stbir__simdf_load1z( d, decode+(ofs)*3+8 ); \
5175 stbir__simdf_madd( tot2, tot2, c, d ); }
5176
// store: the three accumulators are recombined into tot0 and the pointers are
// advanced first. If more output follows (output < output_end) a full
// stbir__simdf_store is issued at the pixel just finished and the enclosing
// loop continues - presumably that store also writes into the next pixel's
// first float, which is overwritten later. For the final pixel exactly three
// floats are written (two-float store plus one-float store) and the enclosing
// loop is left with break.
5177#define stbir__store_output() \
5178 stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 ); \
5179 stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 ); \
5180 stbir__simdf_0123to1230( tot2, tot2 ); \
5181 stbir__simdf_add( tot0, tot0, cs ); \
5182 stbir__simdf_add( c, c, tot2 ); \
5183 stbir__simdf_add( tot0, tot0, c ); \
5184 horizontal_coefficients += coefficient_width; \
5185 ++horizontal_contributors; \
5186 output += 3; \
5187 if ( output < output_end ) \
5188 { \
5189 stbir__simdf_store( output-3, tot0 ); \
5190 continue; \
5191 } \
5192 stbir__simdf_0123to2301( tot1, tot0 ); \
5193 stbir__simdf_store2( output-3, tot0 ); \
5194 stbir__simdf_store1( output+2-3, tot1 ); \
5195 break;
5196
5197#endif
5198
5199#else
5200
// Three-channel horizontal gather macros, plain C build.
// Pixels are three consecutive floats in decode, so coefficient k applies to
// decode[3k], decode[3k+1] and decode[3k+2]. decode, hc, output,
// horizontal_coefficients, coefficient_width and horizontal_contributors come
// from the scope the macro is expanded in.

// exactly one coefficient
5201#define stbir__1_coeff_only() \
5202 float tot0, tot1, tot2, c; \
5203 c = hc[0]; \
5204 tot0 = decode[0]*c; \
5205 tot1 = decode[1]*c; \
5206 tot2 = decode[2]*c;
5207
// exactly two coefficients
5208#define stbir__2_coeff_only() \
5209 float tot0, tot1, tot2, c; \
5210 c = hc[0]; \
5211 tot0 = decode[0]*c; \
5212 tot1 = decode[1]*c; \
5213 tot2 = decode[2]*c; \
5214 c = hc[1]; \
5215 tot0 += decode[3]*c; \
5216 tot1 += decode[4]*c; \
5217 tot2 += decode[5]*c;
5218
// exactly three coefficients
5219#define stbir__3_coeff_only() \
5220 float tot0, tot1, tot2, c; \
5221 c = hc[0]; \
5222 tot0 = decode[0]*c; \
5223 tot1 = decode[1]*c; \
5224 tot2 = decode[2]*c; \
5225 c = hc[1]; \
5226 tot0 += decode[3]*c; \
5227 tot1 += decode[4]*c; \
5228 tot2 += decode[5]*c; \
5229 c = hc[2]; \
5230 tot0 += decode[6]*c; \
5231 tot1 += decode[7]*c; \
5232 tot2 += decode[8]*c;
5233
// store for the *_coeff_only macros: write the three channels, then move on to
// the next output pixel (next coefficient row, next contributor entry)
5234#define stbir__store_output_tiny() \
5235 output[0] = tot0; \
5236 output[1] = tot1; \
5237 output[2] = tot2; \
5238 horizontal_coefficients += coefficient_width; \
5239 ++horizontal_contributors; \
5240 output += 3;
5241
// first four coefficients of a longer filter. Accumulators are named
// tot<coefficient slot a..d><channel 0..2>, i.e. one per coefficient per channel
5242#define stbir__4_coeff_start() \
5243 float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c; \
5244 c = hc[0]; \
5245 tota0 = decode[0]*c; \
5246 tota1 = decode[1]*c; \
5247 tota2 = decode[2]*c; \
5248 c = hc[1]; \
5249 totb0 = decode[3]*c; \
5250 totb1 = decode[4]*c; \
5251 totb2 = decode[5]*c; \
5252 c = hc[2]; \
5253 totc0 = decode[6]*c; \
5254 totc1 = decode[7]*c; \
5255 totc2 = decode[8]*c; \
5256 c = hc[3]; \
5257 totd0 = decode[9]*c; \
5258 totd1 = decode[10]*c; \
5259 totd2 = decode[11]*c;
5260
// four more coefficients starting at index ofs (pixel data at (ofs)*3)
5261#define stbir__4_coeff_continue_from_4( ofs ) \
5262 c = hc[0+(ofs)]; \
5263 tota0 += decode[0+(ofs)*3]*c; \
5264 tota1 += decode[1+(ofs)*3]*c; \
5265 tota2 += decode[2+(ofs)*3]*c; \
5266 c = hc[1+(ofs)]; \
5267 totb0 += decode[3+(ofs)*3]*c; \
5268 totb1 += decode[4+(ofs)*3]*c; \
5269 totb2 += decode[5+(ofs)*3]*c; \
5270 c = hc[2+(ofs)]; \
5271 totc0 += decode[6+(ofs)*3]*c; \
5272 totc1 += decode[7+(ofs)*3]*c; \
5273 totc2 += decode[8+(ofs)*3]*c; \
5274 c = hc[3+(ofs)]; \
5275 totd0 += decode[9+(ofs)*3]*c; \
5276 totd1 += decode[10+(ofs)*3]*c; \
5277 totd2 += decode[11+(ofs)*3]*c;
5278
// one trailing coefficient at index ofs
5279#define stbir__1_coeff_remnant( ofs ) \
5280 c = hc[0+(ofs)]; \
5281 tota0 += decode[0+(ofs)*3]*c; \
5282 tota1 += decode[1+(ofs)*3]*c; \
5283 tota2 += decode[2+(ofs)*3]*c;
5284
// two trailing coefficients starting at index ofs (the definition ends with a
// continuation onto the blank line that follows it)
5285#define stbir__2_coeff_remnant( ofs ) \
5286 c = hc[0+(ofs)]; \
5287 tota0 += decode[0+(ofs)*3]*c; \
5288 tota1 += decode[1+(ofs)*3]*c; \
5289 tota2 += decode[2+(ofs)*3]*c; \
5290 c = hc[1+(ofs)]; \
5291 totb0 += decode[3+(ofs)*3]*c; \
5292 totb1 += decode[4+(ofs)*3]*c; \
5293 totb2 += decode[5+(ofs)*3]*c; \
5294
// three trailing coefficients starting at index ofs
5295#define stbir__3_coeff_remnant( ofs ) \
5296 c = hc[0+(ofs)]; \
5297 tota0 += decode[0+(ofs)*3]*c; \
5298 tota1 += decode[1+(ofs)*3]*c; \
5299 tota2 += decode[2+(ofs)*3]*c; \
5300 c = hc[1+(ofs)]; \
5301 totb0 += decode[3+(ofs)*3]*c; \
5302 totb1 += decode[4+(ofs)*3]*c; \
5303 totb2 += decode[5+(ofs)*3]*c; \
5304 c = hc[2+(ofs)]; \
5305 totc0 += decode[6+(ofs)*3]*c; \
5306 totc1 += decode[7+(ofs)*3]*c; \
5307 totc2 += decode[8+(ofs)*3]*c;
5308
// store for the stbir__4_coeff_* family: per channel the four accumulators are
// summed in the fixed pairing (a+c)+(b+d), then everything advances to the next
// output pixel
5309#define stbir__store_output() \
5310 output[0] = (tota0+totc0)+(totb0+totd0); \
5311 output[1] = (tota1+totc1)+(totb1+totd1); \
5312 output[2] = (tota2+totc2)+(totb2+totd2); \
5313 horizontal_coefficients += coefficient_width; \
5314 ++horizontal_contributors; \
5315 output += 3;
5316
5317#endif
5318
// Set the channel count to 3 and include STBIR__HEADER_FILENAME with
// STB_IMAGE_RESIZE_DO_HORIZONTALS defined. Presumably that is this header
// itself, and the section guarded by the define expands the macros above into
// the 3-channel horizontal routines - the included section is not visible here.
5319#define STBIR__horizontal_channels 3
5320#define STB_IMAGE_RESIZE_DO_HORIZONTALS
5321#include STBIR__HEADER_FILENAME
5322
5323//=================
5324// Do 4 channel horizontal routines
5325
5326#ifdef STBIR_SIMD
5327
// Four-channel horizontal gather macros, STBIR_SIMD build: the short filter
// (1, 2 or 3 coefficient) forms and their store. Pixels are four floats apart
// in decode, so each pixel is one stbir__simdf-sized memory operand at decode,
// decode+4, decode+8. decode, hc, output, horizontal_coefficients,
// coefficient_width and horizontal_contributors come from the scope the macro
// is expanded in.

// exactly one coefficient; the scalar form is p0..p3 = decode[0..3]*hc[0]
5328#define stbir__1_coeff_only() \
5329 stbir__simdf tot,c; \
5330 STBIR_SIMD_NO_UNROLL(decode); \
5331 stbir__simdf_load1( c, hc ); \
5332 stbir__simdf_0123to0000( c, c ); \
5333 stbir__simdf_mult_mem( tot, c, decode );
5334
// exactly two coefficients
5335#define stbir__2_coeff_only() \
5336 stbir__simdf tot,c,cs; \
5337 STBIR_SIMD_NO_UNROLL(decode); \
5338 stbir__simdf_load2( cs, hc ); \
5339 stbir__simdf_0123to0000( c, cs ); \
5340 stbir__simdf_mult_mem( tot, c, decode ); \
5341 stbir__simdf_0123to1111( c, cs ); \
5342 stbir__simdf_madd_mem( tot, tot, c, decode+4 );
5343
// exactly three coefficients
5344#define stbir__3_coeff_only() \
5345 stbir__simdf tot,c,cs; \
5346 STBIR_SIMD_NO_UNROLL(decode); \
5347 stbir__simdf_load( cs, hc ); \
5348 stbir__simdf_0123to0000( c, cs ); \
5349 stbir__simdf_mult_mem( tot, c, decode ); \
5350 stbir__simdf_0123to1111( c, cs ); \
5351 stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
5352 stbir__simdf_0123to2222( c, cs ); \
5353 stbir__simdf_madd_mem( tot, tot, c, decode+8 );
5354
// store for the *_coeff_only macros: tot is written with one full store, then
// everything advances to the next output pixel
5355#define stbir__store_output_tiny() \
5356 stbir__simdf_store( output, tot ); \
5357 horizontal_coefficients += coefficient_width; \
5358 ++horizontal_contributors; \
5359 output += 4;
5360
5361#ifdef STBIR_SIMD8
5362
// Four-channel, four-or-more coefficient macros when STBIR_SIMD8 is defined.
// One stbir__simdf8 accumulator (tot0) takes two pixels per multiply-add: the
// memory operands are at decode+(ofs)*4 and eight floats further on.

// first four coefficients; declares tot0, the temporaries c and cs, and the
// 4-wide temporary t used by the remnant and store macros
5363#define stbir__4_coeff_start() \
5364 stbir__simdf8 tot0,c,cs; stbir__simdf t; \
5365 STBIR_SIMD_NO_UNROLL(decode); \
5366 stbir__simdf8_load4b( cs, hc ); \
5367 stbir__simdf8_0123to00001111( c, cs ); \
5368 stbir__simdf8_mult_mem( tot0, c, decode ); \
5369 stbir__simdf8_0123to22223333( c, cs ); \
5370 stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
5371
// four more coefficients starting at index ofs
5372#define stbir__4_coeff_continue_from_4( ofs ) \
5373 STBIR_SIMD_NO_UNROLL(decode); \
5374 stbir__simdf8_load4b( cs, hc + (ofs) ); \
5375 stbir__simdf8_0123to00001111( c, cs ); \
5376 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5377 stbir__simdf8_0123to22223333( c, cs ); \
5378 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5379
// one trailing coefficient, accumulated through the 4-wide temp t
5380#define stbir__1_coeff_remnant( ofs ) \
5381 STBIR_SIMD_NO_UNROLL(decode); \
5382 stbir__simdf_load1rep4( t, hc + (ofs) ); \
5383 stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
5384
// two trailing coefficients. The coefficient load starts at hc+(ofs)-2 and the
// 22223333 shuffle is then applied, so presumably the two wanted coefficients
// are picked from positions 2 and 3 of that load; this needs (ofs) >= 2 to stay
// inside the coefficient row - NOTE(review): confirm every expansion site
// satisfies that
5385#define stbir__2_coeff_remnant( ofs ) \
5386 STBIR_SIMD_NO_UNROLL(decode); \
5387 stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \
5388 stbir__simdf8_0123to22223333( c, cs ); \
5389 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5390
// three trailing coefficients: two with a full-width multiply-add, the third
// through the 4-wide temp t
5391 #define stbir__3_coeff_remnant( ofs ) \
5392 STBIR_SIMD_NO_UNROLL(decode); \
5393 stbir__simdf8_load4b( cs, hc + (ofs) ); \
5394 stbir__simdf8_0123to00001111( c, cs ); \
5395 stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5396 stbir__simdf8_0123to2222( t, cs ); \
5397 stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
5398
// store: tot0 is reduced to the 4-wide t (stbir__simdf8_add4halves), t is
// written with one full store, then everything advances to the next output
// pixel
5399#define stbir__store_output() \
5400 stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
5401 stbir__simdf_store( output, t ); \
5402 horizontal_coefficients += coefficient_width; \
5403 ++horizontal_contributors; \
5404 output += 4;
5405
5406#else
5407
// Four-channel, four-or-more coefficient macros for the 4-wide SIMD build
// (STBIR_SIMD8 not defined). Two accumulators alternate: coefficients 0 and 2
// of each group of four go to tot0, coefficients 1 and 3 go to tot1; they are
// only added together in stbir__store_output.

// first four coefficients; declares tot0, tot1 and the temporaries c and cs
5408#define stbir__4_coeff_start() \
5409 stbir__simdf tot0,tot1,c,cs; \
5410 STBIR_SIMD_NO_UNROLL(decode); \
5411 stbir__simdf_load( cs, hc ); \
5412 stbir__simdf_0123to0000( c, cs ); \
5413 stbir__simdf_mult_mem( tot0, c, decode ); \
5414 stbir__simdf_0123to1111( c, cs ); \
5415 stbir__simdf_mult_mem( tot1, c, decode+4 ); \
5416 stbir__simdf_0123to2222( c, cs ); \
5417 stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
5418 stbir__simdf_0123to3333( c, cs ); \
5419 stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
5420
// four more coefficients starting at index ofs (pixel data at (ofs)*4)
5421#define stbir__4_coeff_continue_from_4( ofs ) \
5422 STBIR_SIMD_NO_UNROLL(decode); \
5423 stbir__simdf_load( cs, hc + (ofs) ); \
5424 stbir__simdf_0123to0000( c, cs ); \
5425 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5426 stbir__simdf_0123to1111( c, cs ); \
5427 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
5428 stbir__simdf_0123to2222( c, cs ); \
5429 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); \
5430 stbir__simdf_0123to3333( c, cs ); \
5431 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
5432
// one trailing coefficient, accumulated into tot0
5433#define stbir__1_coeff_remnant( ofs ) \
5434 STBIR_SIMD_NO_UNROLL(decode); \
5435 stbir__simdf_load1( c, hc + (ofs) ); \
5436 stbir__simdf_0123to0000( c, c ); \
5437 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5438
// two trailing coefficients: first into tot0, second into tot1
5439#define stbir__2_coeff_remnant( ofs ) \
5440 STBIR_SIMD_NO_UNROLL(decode); \
5441 stbir__simdf_load2( cs, hc + (ofs) ); \
5442 stbir__simdf_0123to0000( c, cs ); \
5443 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5444 stbir__simdf_0123to1111( c, cs ); \
5445 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
5446
// three trailing coefficients: into tot0, tot1 and tot0 again
5447#define stbir__3_coeff_remnant( ofs ) \
5448 STBIR_SIMD_NO_UNROLL(decode); \
5449 stbir__simdf_load( cs, hc + (ofs) ); \
5450 stbir__simdf_0123to0000( c, cs ); \
5451 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5452 stbir__simdf_0123to1111( c, cs ); \
5453 stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
5454 stbir__simdf_0123to2222( c, cs ); \
5455 stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5456
// store: tot1 is added into tot0, the sum is written with one full store, then
// everything advances to the next output pixel
5457#define stbir__store_output() \
5458 stbir__simdf_add( tot0, tot0, tot1 ); \
5459 stbir__simdf_store( output, tot0 ); \
5460 horizontal_coefficients += coefficient_width; \
5461 ++horizontal_contributors; \
5462 output += 4;
5463
5464#endif
5465
5466#else
5467
// 4-channel horizontal gather pieces, plain C flavor.
// Every macro reads `hc` (coefficients) and `decode` (interleaved input floats,
// 4 per pixel) from the expansion site. The *_only macros pair with
// stbir__store_output_tiny(); stbir__4_coeff_start(), the continue/remnant
// macros and stbir__store_output() share the x*/y* accumulators.

// Whole filter is one coefficient: p[ch] = decode[ch] * hc[0].
#define stbir__1_coeff_only() \
    float p0,p1,p2,p3,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0]; \
    p0 = decode[0] * c; \
    p1 = decode[1] * c; \
    p2 = decode[2] * c; \
    p3 = decode[3] * c;

// Whole filter is two coefficients.
#define stbir__2_coeff_only() \
    float p0,p1,p2,p3,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0]; \
    p0 = decode[0] * c; \
    p1 = decode[1] * c; \
    p2 = decode[2] * c; \
    p3 = decode[3] * c; \
    c = hc[1]; \
    p0 += decode[4] * c; \
    p1 += decode[5] * c; \
    p2 += decode[6] * c; \
    p3 += decode[7] * c;

// Whole filter is three coefficients.
#define stbir__3_coeff_only() \
    float p0,p1,p2,p3,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0]; \
    p0 = decode[0] * c; \
    p1 = decode[1] * c; \
    p2 = decode[2] * c; \
    p3 = decode[3] * c; \
    c = hc[1]; \
    p0 += decode[4] * c; \
    p1 += decode[5] * c; \
    p2 += decode[6] * c; \
    p3 += decode[7] * c; \
    c = hc[2]; \
    p0 += decode[8] * c; \
    p1 += decode[9] * c; \
    p2 += decode[10] * c; \
    p3 += decode[11] * c;

// Writes the p0..p3 result of a *_only macro and advances the coefficient,
// contributor and output cursors to the next pixel.
#define stbir__store_output_tiny() \
    output[0] = p0; \
    output[1] = p1; \
    output[2] = p2; \
    output[3] = p3; \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 4;

// Declares the accumulators and folds in coefficients 0..3.
// Even coefficients accumulate into x0..x3, odd ones into y0..y3;
// stbir__store_output() adds the two sets.
#define stbir__4_coeff_start() \
    float x0,x1,x2,x3,y0,y1,y2,y3,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0]; \
    x0 = decode[0] * c; \
    x1 = decode[1] * c; \
    x2 = decode[2] * c; \
    x3 = decode[3] * c; \
    c = hc[1]; \
    y0 = decode[4] * c; \
    y1 = decode[5] * c; \
    y2 = decode[6] * c; \
    y3 = decode[7] * c; \
    c = hc[2]; \
    x0 += decode[8] * c; \
    x1 += decode[9] * c; \
    x2 += decode[10] * c; \
    x3 += decode[11] * c; \
    c = hc[3]; \
    y0 += decode[12] * c; \
    y1 += decode[13] * c; \
    y2 += decode[14] * c; \
    y3 += decode[15] * c;

// Accumulates four more coefficients starting at coefficient index `ofs`.
#define stbir__4_coeff_continue_from_4( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*4] * c; \
    x1 += decode[1+(ofs)*4] * c; \
    x2 += decode[2+(ofs)*4] * c; \
    x3 += decode[3+(ofs)*4] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[4+(ofs)*4] * c; \
    y1 += decode[5+(ofs)*4] * c; \
    y2 += decode[6+(ofs)*4] * c; \
    y3 += decode[7+(ofs)*4] * c; \
    c = hc[2+(ofs)]; \
    x0 += decode[8+(ofs)*4] * c; \
    x1 += decode[9+(ofs)*4] * c; \
    x2 += decode[10+(ofs)*4] * c; \
    x3 += decode[11+(ofs)*4] * c; \
    c = hc[3+(ofs)]; \
    y0 += decode[12+(ofs)*4] * c; \
    y1 += decode[13+(ofs)*4] * c; \
    y2 += decode[14+(ofs)*4] * c; \
    y3 += decode[15+(ofs)*4] * c;

// Accumulates one trailing coefficient at index `ofs`.
#define stbir__1_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*4] * c; \
    x1 += decode[1+(ofs)*4] * c; \
    x2 += decode[2+(ofs)*4] * c; \
    x3 += decode[3+(ofs)*4] * c;

// Accumulates two trailing coefficients at indices `ofs`, `ofs`+1.
#define stbir__2_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*4] * c; \
    x1 += decode[1+(ofs)*4] * c; \
    x2 += decode[2+(ofs)*4] * c; \
    x3 += decode[3+(ofs)*4] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[4+(ofs)*4] * c; \
    y1 += decode[5+(ofs)*4] * c; \
    y2 += decode[6+(ofs)*4] * c; \
    y3 += decode[7+(ofs)*4] * c;

// Accumulates three trailing coefficients at indices `ofs`..`ofs`+2.
#define stbir__3_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*4] * c; \
    x1 += decode[1+(ofs)*4] * c; \
    x2 += decode[2+(ofs)*4] * c; \
    x3 += decode[3+(ofs)*4] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[4+(ofs)*4] * c; \
    y1 += decode[5+(ofs)*4] * c; \
    y2 += decode[6+(ofs)*4] * c; \
    y3 += decode[7+(ofs)*4] * c; \
    c = hc[2+(ofs)]; \
    x0 += decode[8+(ofs)*4] * c; \
    x1 += decode[9+(ofs)*4] * c; \
    x2 += decode[10+(ofs)*4] * c; \
    x3 += decode[11+(ofs)*4] * c;

// Sums the x and y accumulators into one 4-float output pixel and advances the
// coefficient, contributor and output cursors to the next pixel.
#define stbir__store_output() \
    output[0] = x0 + y0; \
    output[1] = x1 + y1; \
    output[2] = x2 + y2; \
    output[3] = x3 + y3; \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 4;
5613
5614#endif
5615
5616#define STBIR__horizontal_channels 4
5617#define STB_IMAGE_RESIZE_DO_HORIZONTALS
5618#include STBIR__HEADER_FILENAME
5619
5620
5621
5622//=================
5623// Do 7 channel horizontal routines
5624
5625#ifdef STBIR_SIMD
5626
// 7-channel short-filter pieces shared by both SIMD widths.
// A 7-float pixel is covered by two 4-wide registers that overlap on channel 3:
// tot0 holds channels 0..3 (offset 0) and tot1 holds channels 3..6 (offset 3).
// NOTE(review): the stbir__simdf_* helpers are defined elsewhere; the lane
// broadcast / multiply-from-memory semantics are inferred from their use here.

// Whole filter is one coefficient.
#define stbir__1_coeff_only() \
    stbir__simdf tot0,tot1,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load1( c, hc ); \
    stbir__simdf_0123to0000( c, c ); \
    stbir__simdf_mult_mem( tot0, c, decode ); \
    stbir__simdf_mult_mem( tot1, c, decode+3 );

// Whole filter is two coefficients (second pixel starts at decode+7).
#define stbir__2_coeff_only() \
    stbir__simdf tot0,tot1,c,cs; \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load2( cs, hc ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_mult_mem( tot0, c, decode ); \
    stbir__simdf_mult_mem( tot1, c, decode+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );

// Whole filter is three coefficients (pixels at decode, decode+7, decode+14).
#define stbir__3_coeff_only() \
    stbir__simdf tot0,tot1,c,cs; \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load( cs, hc ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_mult_mem( tot0, c, decode ); \
    stbir__simdf_mult_mem( tot1, c, decode+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 ); \
    stbir__simdf_0123to2222( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );

// Stores the two overlapping halves (channels 3..6 first, then 0..3) and
// advances the coefficient, contributor and output cursors to the next pixel.
#define stbir__store_output_tiny() \
    stbir__simdf_store( output+3, tot1 ); \
    stbir__simdf_store( output, tot0 ); \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 7;
5666
5667#ifdef STBIR_SIMD8
5668
// 7-channel horizontal gather pieces, 8-wide SIMD flavor.
// Each multiply reads 8 floats starting at a 7-float pixel, so the eighth lane
// carries a value belonging to the following pixel; stbir__store_output() below
// deals with that extra lane.
// NOTE(review): the stbir__simdf8_* helpers are defined elsewhere; their
// broadcast / load semantics are inferred from how they are used here.

// Declares the accumulators and folds in coefficients 0..3
// (0 and 2 into tot0, 1 and 3 into tot1).
#define stbir__4_coeff_start() \
    stbir__simdf8 tot0,tot1,c,cs; \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf8_load4b( cs, hc ); \
    stbir__simdf8_0123to00000000( c, cs ); \
    stbir__simdf8_mult_mem( tot0, c, decode ); \
    stbir__simdf8_0123to11111111( c, cs ); \
    stbir__simdf8_mult_mem( tot1, c, decode+7 ); \
    stbir__simdf8_0123to22222222( c, cs ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 ); \
    stbir__simdf8_0123to33333333( c, cs ); \
    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );

// Accumulates four more coefficients starting at coefficient index `ofs`.
#define stbir__4_coeff_continue_from_4( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf8_load4b( cs, hc + (ofs) ); \
    stbir__simdf8_0123to00000000( c, cs ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf8_0123to11111111( c, cs ); \
    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
    stbir__simdf8_0123to22222222( c, cs ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
    stbir__simdf8_0123to33333333( c, cs ); \
    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );

// Accumulates one trailing coefficient at index `ofs` into tot0.
#define stbir__1_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf8_load1b( c, hc + (ofs) ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );

// Accumulates two trailing coefficients at indices `ofs`, `ofs`+1.
#define stbir__2_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf8_load1b( c, hc + (ofs) ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf8_load1b( c, hc + (ofs)+1 ); \
    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );

// Accumulates three trailing coefficients at indices `ofs`..`ofs`+2.
#define stbir__3_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf8_load4b( cs, hc + (ofs) ); \
    stbir__simdf8_0123to00000000( c, cs ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf8_0123to11111111( c, cs ); \
    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
    stbir__simdf8_0123to22222222( c, cs ); \
    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );

// Sums the accumulators, advances the cursors and writes one 7-float pixel.
// Must be expanded directly inside the per-pixel loop: it ends that iteration
// itself with `continue` / `break`.
//  - While more output follows (output < output_end after advancing), a full
//    8-wide store is used; the extra float lands in the next pixel's first slot.
//  - For the final pixel, two 4-wide stores are used instead (upper part at
//    offset 3, then the lower four at offset 0) and the loop is left.
#define stbir__store_output() \
    stbir__simdf8_add( tot0, tot0, tot1 ); \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 7; \
    if ( output < output_end ) \
    { \
      stbir__simdf8_store( output-7, tot0 ); \
      continue; \
    } \
    stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
    stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) ); \
    break;
5729
5730#else
5731
// 7-channel horizontal gather pieces, 4-wide SIMD flavor.
// A 7-float pixel is covered by two overlapping 4-wide registers (offsets 0 and
// 3). Even coefficients accumulate into tot0/tot1, odd coefficients into
// tot2/tot3; stbir__store_output() adds the pairs together.
// NOTE(review): the stbir__simdf_* helpers are defined elsewhere; their
// broadcast / multiply-from-memory semantics are inferred from their use here.

// Declares the accumulators and folds in coefficients 0..3.
#define stbir__4_coeff_start() \
    stbir__simdf tot0,tot1,tot2,tot3,c,cs; \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load( cs, hc ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_mult_mem( tot0, c, decode ); \
    stbir__simdf_mult_mem( tot1, c, decode+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_mult_mem( tot2, c, decode+7 ); \
    stbir__simdf_mult_mem( tot3, c, decode+10 ); \
    stbir__simdf_0123to2222( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); \
    stbir__simdf_0123to3333( c, cs ); \
    stbir__simdf_madd_mem( tot2, tot2, c, decode+21 ); \
    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );

// Accumulates four more coefficients starting at coefficient index `ofs`.
#define stbir__4_coeff_continue_from_4( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load( cs, hc + (ofs) ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
    stbir__simdf_0123to2222( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); \
    stbir__simdf_0123to3333( c, cs ); \
    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 ); \
    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );

// Accumulates one trailing coefficient at index `ofs`.
// (The last line deliberately has no continuation backslash, so the macro body
// cannot absorb whatever line follows it.)
#define stbir__1_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load1( c, hc + (ofs) ); \
    stbir__simdf_0123to0000( c, c ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );

// Accumulates two trailing coefficients at indices `ofs`, `ofs`+1.
#define stbir__2_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load2( cs, hc + (ofs) ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );

// Accumulates three trailing coefficients at indices `ofs`..`ofs`+2.
#define stbir__3_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    stbir__simdf_load( cs, hc + (ofs) ); \
    stbir__simdf_0123to0000( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
    stbir__simdf_0123to1111( c, cs ); \
    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
    stbir__simdf_0123to2222( c, cs ); \
    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );

// Combines the even/odd accumulators, stores the two overlapping halves
// (channels 3..6 first, then 0..3) and advances the coefficient, contributor
// and output cursors to the next pixel.
#define stbir__store_output() \
    stbir__simdf_add( tot0, tot0, tot2 ); \
    stbir__simdf_add( tot1, tot1, tot3 ); \
    stbir__simdf_store( output+3, tot1 ); \
    stbir__simdf_store( output, tot0 ); \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 7;
5803
5804#endif
5805
5806#else
5807
// 7-channel horizontal gather pieces, plain C flavor.
// Every macro reads `hc` (coefficients) and `decode` (interleaved input floats,
// 7 per pixel) from the expansion site. The *_only macros pair with
// stbir__store_output_tiny(); stbir__4_coeff_start(), the continue/remnant
// macros and stbir__store_output() share the x*/y* accumulators.
// No macro ends with a continuation backslash, so none of them can absorb the
// line that follows its definition.

// Whole filter is one coefficient: tot[ch] = decode[ch] * hc[0].
#define stbir__1_coeff_only() \
    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
    c = hc[0]; \
    tot0 = decode[0]*c; \
    tot1 = decode[1]*c; \
    tot2 = decode[2]*c; \
    tot3 = decode[3]*c; \
    tot4 = decode[4]*c; \
    tot5 = decode[5]*c; \
    tot6 = decode[6]*c;

// Whole filter is two coefficients.
#define stbir__2_coeff_only() \
    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
    c = hc[0]; \
    tot0 = decode[0]*c; \
    tot1 = decode[1]*c; \
    tot2 = decode[2]*c; \
    tot3 = decode[3]*c; \
    tot4 = decode[4]*c; \
    tot5 = decode[5]*c; \
    tot6 = decode[6]*c; \
    c = hc[1]; \
    tot0 += decode[7]*c; \
    tot1 += decode[8]*c; \
    tot2 += decode[9]*c; \
    tot3 += decode[10]*c; \
    tot4 += decode[11]*c; \
    tot5 += decode[12]*c; \
    tot6 += decode[13]*c;

// Whole filter is three coefficients.
#define stbir__3_coeff_only() \
    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
    c = hc[0]; \
    tot0 = decode[0]*c; \
    tot1 = decode[1]*c; \
    tot2 = decode[2]*c; \
    tot3 = decode[3]*c; \
    tot4 = decode[4]*c; \
    tot5 = decode[5]*c; \
    tot6 = decode[6]*c; \
    c = hc[1]; \
    tot0 += decode[7]*c; \
    tot1 += decode[8]*c; \
    tot2 += decode[9]*c; \
    tot3 += decode[10]*c; \
    tot4 += decode[11]*c; \
    tot5 += decode[12]*c; \
    tot6 += decode[13]*c; \
    c = hc[2]; \
    tot0 += decode[14]*c; \
    tot1 += decode[15]*c; \
    tot2 += decode[16]*c; \
    tot3 += decode[17]*c; \
    tot4 += decode[18]*c; \
    tot5 += decode[19]*c; \
    tot6 += decode[20]*c;

// Writes the tot0..tot6 result of a *_only macro and advances the coefficient,
// contributor and output cursors to the next pixel.
#define stbir__store_output_tiny() \
    output[0] = tot0; \
    output[1] = tot1; \
    output[2] = tot2; \
    output[3] = tot3; \
    output[4] = tot4; \
    output[5] = tot5; \
    output[6] = tot6; \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 7;

// Declares the accumulators and folds in coefficients 0..3.
// Even coefficients accumulate into x0..x6, odd ones into y0..y6;
// stbir__store_output() adds the two sets.
#define stbir__4_coeff_start() \
    float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0]; \
    x0 = decode[0] * c; \
    x1 = decode[1] * c; \
    x2 = decode[2] * c; \
    x3 = decode[3] * c; \
    x4 = decode[4] * c; \
    x5 = decode[5] * c; \
    x6 = decode[6] * c; \
    c = hc[1]; \
    y0 = decode[7] * c; \
    y1 = decode[8] * c; \
    y2 = decode[9] * c; \
    y3 = decode[10] * c; \
    y4 = decode[11] * c; \
    y5 = decode[12] * c; \
    y6 = decode[13] * c; \
    c = hc[2]; \
    x0 += decode[14] * c; \
    x1 += decode[15] * c; \
    x2 += decode[16] * c; \
    x3 += decode[17] * c; \
    x4 += decode[18] * c; \
    x5 += decode[19] * c; \
    x6 += decode[20] * c; \
    c = hc[3]; \
    y0 += decode[21] * c; \
    y1 += decode[22] * c; \
    y2 += decode[23] * c; \
    y3 += decode[24] * c; \
    y4 += decode[25] * c; \
    y5 += decode[26] * c; \
    y6 += decode[27] * c;

// Accumulates four more coefficients starting at coefficient index `ofs`.
#define stbir__4_coeff_continue_from_4( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*7] * c; \
    x1 += decode[1+(ofs)*7] * c; \
    x2 += decode[2+(ofs)*7] * c; \
    x3 += decode[3+(ofs)*7] * c; \
    x4 += decode[4+(ofs)*7] * c; \
    x5 += decode[5+(ofs)*7] * c; \
    x6 += decode[6+(ofs)*7] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[7+(ofs)*7] * c; \
    y1 += decode[8+(ofs)*7] * c; \
    y2 += decode[9+(ofs)*7] * c; \
    y3 += decode[10+(ofs)*7] * c; \
    y4 += decode[11+(ofs)*7] * c; \
    y5 += decode[12+(ofs)*7] * c; \
    y6 += decode[13+(ofs)*7] * c; \
    c = hc[2+(ofs)]; \
    x0 += decode[14+(ofs)*7] * c; \
    x1 += decode[15+(ofs)*7] * c; \
    x2 += decode[16+(ofs)*7] * c; \
    x3 += decode[17+(ofs)*7] * c; \
    x4 += decode[18+(ofs)*7] * c; \
    x5 += decode[19+(ofs)*7] * c; \
    x6 += decode[20+(ofs)*7] * c; \
    c = hc[3+(ofs)]; \
    y0 += decode[21+(ofs)*7] * c; \
    y1 += decode[22+(ofs)*7] * c; \
    y2 += decode[23+(ofs)*7] * c; \
    y3 += decode[24+(ofs)*7] * c; \
    y4 += decode[25+(ofs)*7] * c; \
    y5 += decode[26+(ofs)*7] * c; \
    y6 += decode[27+(ofs)*7] * c;

// Accumulates one trailing coefficient at index `ofs`.
#define stbir__1_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*7] * c; \
    x1 += decode[1+(ofs)*7] * c; \
    x2 += decode[2+(ofs)*7] * c; \
    x3 += decode[3+(ofs)*7] * c; \
    x4 += decode[4+(ofs)*7] * c; \
    x5 += decode[5+(ofs)*7] * c; \
    x6 += decode[6+(ofs)*7] * c;

// Accumulates two trailing coefficients at indices `ofs`, `ofs`+1.
#define stbir__2_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*7] * c; \
    x1 += decode[1+(ofs)*7] * c; \
    x2 += decode[2+(ofs)*7] * c; \
    x3 += decode[3+(ofs)*7] * c; \
    x4 += decode[4+(ofs)*7] * c; \
    x5 += decode[5+(ofs)*7] * c; \
    x6 += decode[6+(ofs)*7] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[7+(ofs)*7] * c; \
    y1 += decode[8+(ofs)*7] * c; \
    y2 += decode[9+(ofs)*7] * c; \
    y3 += decode[10+(ofs)*7] * c; \
    y4 += decode[11+(ofs)*7] * c; \
    y5 += decode[12+(ofs)*7] * c; \
    y6 += decode[13+(ofs)*7] * c;

// Accumulates three trailing coefficients at indices `ofs`..`ofs`+2.
#define stbir__3_coeff_remnant( ofs ) \
    STBIR_SIMD_NO_UNROLL(decode); \
    c = hc[0+(ofs)]; \
    x0 += decode[0+(ofs)*7] * c; \
    x1 += decode[1+(ofs)*7] * c; \
    x2 += decode[2+(ofs)*7] * c; \
    x3 += decode[3+(ofs)*7] * c; \
    x4 += decode[4+(ofs)*7] * c; \
    x5 += decode[5+(ofs)*7] * c; \
    x6 += decode[6+(ofs)*7] * c; \
    c = hc[1+(ofs)]; \
    y0 += decode[7+(ofs)*7] * c; \
    y1 += decode[8+(ofs)*7] * c; \
    y2 += decode[9+(ofs)*7] * c; \
    y3 += decode[10+(ofs)*7] * c; \
    y4 += decode[11+(ofs)*7] * c; \
    y5 += decode[12+(ofs)*7] * c; \
    y6 += decode[13+(ofs)*7] * c; \
    c = hc[2+(ofs)]; \
    x0 += decode[14+(ofs)*7] * c; \
    x1 += decode[15+(ofs)*7] * c; \
    x2 += decode[16+(ofs)*7] * c; \
    x3 += decode[17+(ofs)*7] * c; \
    x4 += decode[18+(ofs)*7] * c; \
    x5 += decode[19+(ofs)*7] * c; \
    x6 += decode[20+(ofs)*7] * c;

// Sums the x and y accumulators into one 7-float output pixel and advances the
// coefficient, contributor and output cursors to the next pixel.
#define stbir__store_output() \
    output[0] = x0 + y0; \
    output[1] = x1 + y1; \
    output[2] = x2 + y2; \
    output[3] = x3 + y3; \
    output[4] = x4 + y4; \
    output[5] = x5 + y5; \
    output[6] = x6 + y6; \
    horizontal_coefficients += coefficient_width; \
    ++horizontal_contributors; \
    output += 7;
6016
6017#endif
6018
6019#define STBIR__horizontal_channels 7
6020#define STB_IMAGE_RESIZE_DO_HORIZONTALS
6021#include STBIR__HEADER_FILENAME
6022
6023
6024// include all of the vertical resamplers (both scatter and gather versions)
6025
6026#define STBIR__vertical_channels 1
6027#define STB_IMAGE_RESIZE_DO_VERTICALS
6028#include STBIR__HEADER_FILENAME
6029
6030#define STBIR__vertical_channels 1
6031#define STB_IMAGE_RESIZE_DO_VERTICALS
6032#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6033#include STBIR__HEADER_FILENAME
6034
6035#define STBIR__vertical_channels 2
6036#define STB_IMAGE_RESIZE_DO_VERTICALS
6037#include STBIR__HEADER_FILENAME
6038
6039#define STBIR__vertical_channels 2
6040#define STB_IMAGE_RESIZE_DO_VERTICALS
6041#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6042#include STBIR__HEADER_FILENAME
6043
6044#define STBIR__vertical_channels 3
6045#define STB_IMAGE_RESIZE_DO_VERTICALS
6046#include STBIR__HEADER_FILENAME
6047
6048#define STBIR__vertical_channels 3
6049#define STB_IMAGE_RESIZE_DO_VERTICALS
6050#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6051#include STBIR__HEADER_FILENAME
6052
6053#define STBIR__vertical_channels 4
6054#define STB_IMAGE_RESIZE_DO_VERTICALS
6055#include STBIR__HEADER_FILENAME
6056
6057#define STBIR__vertical_channels 4
6058#define STB_IMAGE_RESIZE_DO_VERTICALS
6059#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6060#include STBIR__HEADER_FILENAME
6061
6062#define STBIR__vertical_channels 5
6063#define STB_IMAGE_RESIZE_DO_VERTICALS
6064#include STBIR__HEADER_FILENAME
6065
6066#define STBIR__vertical_channels 5
6067#define STB_IMAGE_RESIZE_DO_VERTICALS
6068#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6069#include STBIR__HEADER_FILENAME
6070
6071#define STBIR__vertical_channels 6
6072#define STB_IMAGE_RESIZE_DO_VERTICALS
6073#include STBIR__HEADER_FILENAME
6074
6075#define STBIR__vertical_channels 6
6076#define STB_IMAGE_RESIZE_DO_VERTICALS
6077#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6078#include STBIR__HEADER_FILENAME
6079
6080#define STBIR__vertical_channels 7
6081#define STB_IMAGE_RESIZE_DO_VERTICALS
6082#include STBIR__HEADER_FILENAME
6083
6084#define STBIR__vertical_channels 7
6085#define STB_IMAGE_RESIZE_DO_VERTICALS
6086#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6087#include STBIR__HEADER_FILENAME
6088
6089#define STBIR__vertical_channels 8
6090#define STB_IMAGE_RESIZE_DO_VERTICALS
6091#include STBIR__HEADER_FILENAME
6092
6093#define STBIR__vertical_channels 8
6094#define STB_IMAGE_RESIZE_DO_VERTICALS
6095#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6096#include STBIR__HEADER_FILENAME
6097
6098typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
6099
6100static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
6101{
6102 stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
6103};
6104
6105static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
6106{
6107 stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
6108};
6109
6110typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
6111
6112static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
6113{
6114 stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
6115};
6116
6117static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
6118{
6119 stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
6120};
6121
6122
6123static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6124{
6125 int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
6126 int channels = stbir_info->channels;
6127 int width_times_channels = num_pixels * channels;
6128 void * output_buffer;
6129
6130 // un-alpha weight if we need to
6131 if ( stbir_info->alpha_unweight )
6132 {
6133 STBIR_PROFILE_START( unalpha );
6134 stbir_info->alpha_unweight( encode_buffer, width_times_channels );
6135 STBIR_PROFILE_END( unalpha );
6136 }
6137
6138 // write directly into output by default
6139 output_buffer = output_buffer_data;
6140
6141 // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
6142 if ( stbir_info->out_pixels_cb )
6143 output_buffer = encode_buffer;
6144
6145 STBIR_PROFILE_START( encode );
6146 // convert into the output buffer
6147 stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
6148 STBIR_PROFILE_END( encode );
6149
6150 // if we have an output callback, call it to send the data
6151 if ( stbir_info->out_pixels_cb )
6152 stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
6153}
6154
6155
6156// Get the ring buffer pointer for an index
6157static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
6158{
6159 STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
6160
6161 #ifdef STBIR__SEPARATE_ALLOCATIONS
6162 return split_info->ring_buffers[ index ];
6163 #else
6164 return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
6165 #endif
6166}
6167
6168// Get the specified scan line from the ring buffer
6169static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
6170{
6171 int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6172 return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
6173}
6174
6175static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6176{
6177 float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
6178
6179 STBIR_PROFILE_START( horizontal );
6180 if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
6181 STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
6182 else
6183 stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
6184 STBIR_PROFILE_END( horizontal );
6185}
6186
6187static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
6188{
6189 float* encode_buffer = split_info->vertical_buffer;
6190 float* decode_buffer = split_info->decode_buffer;
6191 int vertical_first = stbir_info->vertical_first;
6192 int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
6193 int width_times_channels = stbir_info->effective_channels * width;
6194
6195 STBIR_ASSERT( stbir_info->vertical.is_gather );
6196
6197 // loop over the contributing scanlines and scale into the buffer
6198 STBIR_PROFILE_START( vertical );
6199 {
6200 int k = 0, total = contrib_n1 - contrib_n0 + 1;
6201 STBIR_ASSERT( total > 0 );
6202 do {
6203 float const * inputs[8];
6204 int i, cnt = total; if ( cnt > 8 ) cnt = 8;
6205 for( i = 0 ; i < cnt ; i++ )
6206 inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
6207
6208 // call the N scanlines at a time function (up to 8 scanlines of blending at once)
6209 ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
6210 k += cnt;
6211 total -= cnt;
6212 } while ( total );
6213 }
6214 STBIR_PROFILE_END( vertical );
6215
6216 if ( vertical_first )
6217 {
6218 // Now resample the gathered vertical data in the horizontal axis into the encode buffer
6219 stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6220 }
6221
6222 stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
6223 encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6224}
6225
6226static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
6227{
6228 int ring_buffer_index;
6229 float* ring_buffer;
6230
6231 // Decode the nth scanline from the source image into the decode buffer.
6232 stbir__decode_scanline( stbir_info, n, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6233
6234 // update new end scanline
6235 split_info->ring_buffer_last_scanline = n;
6236
6237 // get ring buffer
6238 ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6239 ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
6240
6241 // Now resample it into the ring buffer.
6242 stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6243
6244 // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
6245}
6246
6247static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6248{
6249 int y, start_output_y, end_output_y;
6250 stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6251 float const * vertical_coefficients = stbir_info->vertical.coefficients;
6252
6253 STBIR_ASSERT( stbir_info->vertical.is_gather );
6254
6255 start_output_y = split_info->start_output_y;
6256 end_output_y = split_info[split_count-1].end_output_y;
6257
6258 vertical_contributors += start_output_y;
6259 vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
6260
6261 // initialize the ring buffer for gathering
6262 split_info->ring_buffer_begin_index = 0;
6263 split_info->ring_buffer_first_scanline = vertical_contributors->n0;
6264 split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
6265
6266 for (y = start_output_y; y < end_output_y; y++)
6267 {
6268 int in_first_scanline, in_last_scanline;
6269
6270 in_first_scanline = vertical_contributors->n0;
6271 in_last_scanline = vertical_contributors->n1;
6272
6273 // make sure the indexing hasn't broken
6274 STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
6275
6276 // Load in new scanlines
6277 while (in_last_scanline > split_info->ring_buffer_last_scanline)
6278 {
6279 STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
6280
6281 // make sure there was room in the ring buffer when we add new scanlines
6282 if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
6283 {
6284 split_info->ring_buffer_first_scanline++;
6285 split_info->ring_buffer_begin_index++;
6286 }
6287
6288 if ( stbir_info->vertical_first )
6289 {
6290 float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
6291 // Decode the nth scanline from the source image into the decode buffer.
6292 stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6293 }
6294 else
6295 {
6296 stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
6297 }
6298 }
6299
6300 // Now all buffers should be ready to write a row of vertical sampling, so do it.
6301 stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
6302
6303 ++vertical_contributors;
6304 vertical_coefficients += stbir_info->vertical.coefficient_width;
6305 }
6306}
6307
// Sentinel written to element 0 of a scatter ring buffer scanline to flag it as empty.
#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F

// Non-zero when the first float of the buffer at `ptr` holds the empty sentinel.
#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ( *(ptr) == STBIR__FLOAT_EMPTY_MARKER )
6310
6311static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6312{
6313 // evict a scanline out into the output buffer
6314 float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6315
6316 // dump the scanline out
6317 stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6318
6319 // mark it as empty
6320 ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6321
6322 // advance the first scanline
6323 split_info->ring_buffer_first_scanline++;
6324 if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6325 split_info->ring_buffer_begin_index = 0;
6326}
6327
6328static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6329{
6330 // evict a scanline out into the output buffer
6331
6332 float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6333
6334 // Now resample it into the buffer.
6335 stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6336
6337 // dump the scanline out
6338 stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6339
6340 // mark it as empty
6341 ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6342
6343 // advance the first scanline
6344 split_info->ring_buffer_first_scanline++;
6345 if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6346 split_info->ring_buffer_begin_index = 0;
6347}
6348
6349static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
6350{
6351 STBIR_ASSERT( !stbir_info->vertical.is_gather );
6352
6353 STBIR_PROFILE_START( vertical );
6354 {
6355 int k = 0, total = n1 - n0 + 1;
6356 STBIR_ASSERT( total > 0 );
6357 do {
6358 float * outputs[8];
6359 int i, n = total; if ( n > 8 ) n = 8;
6360 for( i = 0 ; i < n ; i++ )
6361 {
6362 outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
6363 if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
6364 {
6365 n = i;
6366 break;
6367 }
6368 }
6369 // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
6370 ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
6371 k += n;
6372 total -= n;
6373 } while ( total );
6374 }
6375
6376 STBIR_PROFILE_END( vertical );
6377}
6378
6379typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
6380
6381static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6382{
6383 int y, start_output_y, end_output_y, start_input_y, end_input_y;
6384 stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6385 float const * vertical_coefficients = stbir_info->vertical.coefficients;
6386 stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
6387 void * scanline_scatter_buffer;
6388 void * scanline_scatter_buffer_end;
6389 int on_first_input_y, last_input_y;
6390
6391 STBIR_ASSERT( !stbir_info->vertical.is_gather );
6392
6393 start_output_y = split_info->start_output_y;
6394 end_output_y = split_info[split_count-1].end_output_y; // may do multiple split counts
6395
6396 start_input_y = split_info->start_input_y;
6397 end_input_y = split_info[split_count-1].end_input_y;
6398
6399 // adjust for starting offset start_input_y
6400 y = start_input_y + stbir_info->vertical.filter_pixel_margin;
6401 vertical_contributors += y ;
6402 vertical_coefficients += stbir_info->vertical.coefficient_width * y;
6403
6404 if ( stbir_info->vertical_first )
6405 {
6406 handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
6407 scanline_scatter_buffer = split_info->decode_buffer;
6408 scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
6409 }
6410 else
6411 {
6412 handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
6413 scanline_scatter_buffer = split_info->vertical_buffer;
6414 scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
6415 }
6416
6417 // initialize the ring buffer for scattering
6418 split_info->ring_buffer_first_scanline = start_output_y;
6419 split_info->ring_buffer_last_scanline = -1;
6420 split_info->ring_buffer_begin_index = -1;
6421
6422 // mark all the buffers as empty to start
6423 for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
6424 stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
6425
6426 // do the loop in input space
6427 on_first_input_y = 1; last_input_y = start_input_y;
6428 for (y = start_input_y ; y < end_input_y; y++)
6429 {
6430 int out_first_scanline, out_last_scanline;
6431
6432 out_first_scanline = vertical_contributors->n0;
6433 out_last_scanline = vertical_contributors->n1;
6434
6435 STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
6436
6437 if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
6438 {
6439 float const * vc = vertical_coefficients;
6440
6441 // keep track of the range actually seen for the next resize
6442 last_input_y = y;
6443 if ( ( on_first_input_y ) && ( y > start_input_y ) )
6444 split_info->start_input_y = y;
6445 on_first_input_y = 0;
6446
6447 // clip the region
6448 if ( out_first_scanline < start_output_y )
6449 {
6450 vc += start_output_y - out_first_scanline;
6451 out_first_scanline = start_output_y;
6452 }
6453
6454 if ( out_last_scanline >= end_output_y )
6455 out_last_scanline = end_output_y - 1;
6456
6457 // if very first scanline, init the index
6458 if (split_info->ring_buffer_begin_index < 0)
6459 split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
6460
6461 STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
6462
6463 // Decode the nth scanline from the source image into the decode buffer.
6464 stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6465
6466 // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
6467 if ( !stbir_info->vertical_first )
6468 stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6469
6470 // Now it's sitting in the buffer ready to be distributed into the ring buffers.
6471
      // evict from the ring buffer if it is full and this row needs a new scanline
6473 if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
6474 ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
6475 handle_scanline_for_scatter( stbir_info, split_info );
6476
6477 // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.