Gitly


1 // Ogg Vorbis audio decoder - v1.22 - public domain
2 // http://nothings.org/stb_vorbis/
3 //
4 // Original version written by Sean Barrett in 2007.
5 //
6 // Originally sponsored by RAD Game Tools. Seeking implementation
7 // sponsored by Phillip Bennefall, Marc Andersen, Aaron Baker,
8 // Elias Software, Aras Pranckevicius, and Sean Barrett.
9 //
10 // LICENSE
11 //
12 //   See end of file for license information.
13 //
14 // Limitations:
15 //
16 //   - floor 0 not supported (used in old ogg vorbis files pre-2004)
17 //   - lossless sample-truncation at beginning ignored
18 //   - cannot concatenate multiple vorbis streams
19 //   - sample positions are 32-bit, limiting seekable 192Khz
20 //       files to around 6 hours (Ogg supports 64-bit)
21 //
22 // Feature contributors:
23 //    Dougall Johnson (sample-exact seeking)
24 //
25 // Bugfix/warning contributors:
26 //    Terje Mathisen     Niklas Frykholm     Andy Hill
27 //    Casey Muratori     John Bolton         Gargaj
28 //    Laurent Gomila     Marc LeBlanc        Ronny Chevalier
29 //    Bernhard Wodo      Evan Balster        github:alxprd
30 //    Tom Beaumont       Ingo Leitgeb        Nicolas Guillemot
31 //    Phillip Bennefall  Rohit               Thiago Goulart
32 //    github:manxorist   Saga Musix          github:infatum
33 //    Timur Gagiev       Maxwell Koo         Peter Waller
34 //    github:audinowho   Dougall Johnson     David Reid
35 //    github:Clownacy    Pedro J. Estebanez  Remi Verschelde
36 //    AnthoFoxo          github:morlat       Gabriel Ravier
37 //
38 // Partial history:
39 //    1.22    - 2021-07-11 - various small fixes
40 //    1.21    - 2021-07-02 - fix bug for files with no comments
41 //    1.20    - 2020-07-11 - several small fixes
42 //    1.19    - 2020-02-05 - warnings
43 //    1.18    - 2020-02-02 - fix seek bugs; parse header comments; misc warnings etc.
44 //    1.17    - 2019-07-08 - fix CVE-2019-13217..CVE-2019-13223 (by ForAllSecure)
45 //    1.16    - 2019-03-04 - fix warnings
46 //    1.15    - 2019-02-07 - explicit failure if Ogg Skeleton data is found
47 //    1.14    - 2018-02-11 - delete bogus dealloca usage
48 //    1.13    - 2018-01-29 - fix truncation of last frame (hopefully)
49 //    1.12    - 2017-11-21 - limit residue begin/end to blocksize/2 to avoid large temp allocs in bad/corrupt files
50 //    1.11    - 2017-07-23 - fix MinGW compilation
51 //    1.10    - 2017-03-03 - more robust seeking; fix negative ilog(); clear error in open_memory
52 //    1.09    - 2016-04-04 - back out 'truncation of last frame' fix from previous version
53 //    1.08    - 2016-04-02 - warnings; setup memory leaks; truncation of last frame
54 //    1.07    - 2015-01-16 - fixes for crashes on invalid files; warning fixes; const
55 //    1.06    - 2015-08-31 - full, correct support for seeking API (Dougall Johnson)
56 //                           some crash fixes when out of memory or with corrupt files
57 //                           fix some inappropriately signed shifts
58 //    1.05    - 2015-04-19 - don't define __forceinline if it's redundant
59 //    1.04    - 2014-08-27 - fix missing const-correct case in API
60 //    1.03    - 2014-08-07 - warning fixes
61 //    1.02    - 2014-07-09 - declare qsort comparison as explicitly _cdecl in Windows
62 //    1.01    - 2014-06-18 - fix stb_vorbis_get_samples_float (interleaved was correct)
63 //    1.0     - 2014-05-26 - fix memory leaks; fix warnings; fix bugs in >2-channel;
64 //                           (API change) report sample rate for decode-full-file funcs
65 //
66 // See end of file for full version history.
67 
68 
69 //////////////////////////////////////////////////////////////////////////////
70 //
71 //  HEADER BEGINS HERE
72 //
73 
74 #ifndef STB_VORBIS_INCLUDE_STB_VORBIS_H
75 #define STB_VORBIS_INCLUDE_STB_VORBIS_H
76 
77 #if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
78 #define STB_VORBIS_NO_STDIO 1
79 #endif
80 
81 #ifndef STB_VORBIS_NO_STDIO
82 #include <stdio.h>
83 #endif
84 
85 #ifdef __cplusplus
86 extern "C" {
87 #endif
88 
89 ///////////   THREAD SAFETY
90 
91 // Individual stb_vorbis* handles are not thread-safe; you cannot decode from
92 // them from multiple threads at the same time. However, you can have multiple
93 // stb_vorbis* handles and decode from them independently in multiple thrads.
94 
95 
96 ///////////   MEMORY ALLOCATION
97 
98 // normally stb_vorbis uses malloc() to allocate memory at startup,
99 // and alloca() to allocate temporary memory during a frame on the
100 // stack. (Memory consumption will depend on the amount of setup
101 // data in the file and how you set the compile flags for speed
102 // vs. size. In my test files the maximal-size usage is ~150KB.)
103 //
104 // You can modify the wrapper functions in the source (setup_malloc,
105 // setup_temp_malloc, temp_malloc) to change this behavior, or you
106 // can use a simpler allocation model: you pass in a buffer from
107 // which stb_vorbis will allocate _all_ its memory (including the
108 // temp memory). "open" may fail with a VORBIS_outofmem if you
109 // do not pass in enough data; there is no way to determine how
110 // much you do need except to succeed (at which point you can
111 // query get_info to find the exact amount required. yes I know
112 // this is lame).
113 //
114 // If you pass in a non-NULL buffer of the type below, allocation
115 // will occur from it as described above. Otherwise just pass NULL
116 // to use malloc()/alloca()
117 
118 typedef struct
119 {
120    char *alloc_buffer;
121    int   alloc_buffer_length_in_bytes;
122 } stb_vorbis_alloc;
123 
124 
125 ///////////   FUNCTIONS USEABLE WITH ALL INPUT MODES
126 
127 typedef struct stb_vorbis stb_vorbis;
128 
129 typedef struct
130 {
131    unsigned int sample_rate;
132    int channels;
133 
134    unsigned int setup_memory_required;
135    unsigned int setup_temp_memory_required;
136    unsigned int temp_memory_required;
137 
138    int max_frame_size;
139 } stb_vorbis_info;
140 
141 typedef struct
142 {
143    char *vendor;
144 
145    int comment_list_length;
146    char **comment_list;
147 } stb_vorbis_comment;
148 
149 // get general information about the file
150 extern stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f);
151 
152 // get ogg comments
153 extern stb_vorbis_comment stb_vorbis_get_comment(stb_vorbis *f);
154 
155 // get the last error detected (clears it, too)
156 extern int stb_vorbis_get_error(stb_vorbis *f);
157 
158 // close an ogg vorbis file and free all memory in use
159 extern void stb_vorbis_close(stb_vorbis *f);
160 
161 // this function returns the offset (in samples) from the beginning of the
162 // file that will be returned by the next decode, if it is known, or -1
163 // otherwise. after a flush_pushdata() call, this may take a while before
164 // it becomes valid again.
165 // NOT WORKING YET after a seek with PULLDATA API
166 extern int stb_vorbis_get_sample_offset(stb_vorbis *f);
167 
168 // returns the current seek point within the file, or offset from the beginning
169 // of the memory buffer. In pushdata mode it returns 0.
170 extern unsigned int stb_vorbis_get_file_offset(stb_vorbis *f);
171 
172 ///////////   PUSHDATA API
173 
174 #ifndef STB_VORBIS_NO_PUSHDATA_API
175 
176 // this API allows you to get blocks of data from any source and hand
177 // them to stb_vorbis. you have to buffer them; stb_vorbis will tell
178 // you how much it used, and you have to give it the rest next time;
179 // and stb_vorbis may not have enough data to work with and you will
180 // need to give it the same data again PLUS more. Note that the Vorbis
181 // specification does not bound the size of an individual frame.
182 
183 extern stb_vorbis *stb_vorbis_open_pushdata(
184          const unsigned char * datablock, int datablock_length_in_bytes,
185          int *datablock_memory_consumed_in_bytes,
186          int *error,
187          const stb_vorbis_alloc *alloc_buffer);
188 // create a vorbis decoder by passing in the initial data block containing
189 //    the ogg&vorbis headers (you don't need to do parse them, just provide
190 //    the first N bytes of the file--you're told if it's not enough, see below)
191 // on success, returns an stb_vorbis *, does not set error, returns the amount of
192 //    data parsed/consumed on this call in *datablock_memory_consumed_in_bytes;
193 // on failure, returns NULL on error and sets *error, does not change *datablock_memory_consumed
194 // if returns NULL and *error is VORBIS_need_more_data, then the input block was
195 //       incomplete and you need to pass in a larger block from the start of the file
196 
197 extern int stb_vorbis_decode_frame_pushdata(
198          stb_vorbis *f,
199          const unsigned char *datablock, int datablock_length_in_bytes,
200          int *channels,             // place to write number of float * buffers
201          float ***output,           // place to write float ** array of float * buffers
202          int *samples               // place to write number of output samples
203      );
204 // decode a frame of audio sample data if possible from the passed-in data block
205 //
206 // return value: number of bytes we used from datablock
207 //
208 // possible cases:
209 //     0 bytes used, 0 samples output (need more data)
210 //     N bytes used, 0 samples output (resynching the stream, keep going)
211 //     N bytes used, M samples output (one frame of data)
212 // note that after opening a file, you will ALWAYS get one N-bytes,0-sample
213 // frame, because Vorbis always "discards" the first frame.
214 //
215 // Note that on resynch, stb_vorbis will rarely consume all of the buffer,
216 // instead only datablock_length_in_bytes-3 or less. This is because it wants
217 // to avoid missing parts of a page header if they cross a datablock boundary,
218 // without writing state-machiney code to record a partial detection.
219 //
220 // The number of channels returned are stored in *channels (which can be
221 // NULL--it is always the same as the number of channels reported by
222 // get_info). *output will contain an array of float* buffers, one per
223 // channel. In other words, (*output)[0][0] contains the first sample from
224 // the first channel, and (*output)[1][0] contains the first sample from
225 // the second channel.
226 //
227 // *output points into stb_vorbis's internal output buffer storage; these
228 // buffers are owned by stb_vorbis and application code should not free
229 // them or modify their contents. They are transient and will be overwritten
230 // once you ask for more data to get decoded, so be sure to grab any data
231 // you need before then.
232 
233 extern void stb_vorbis_flush_pushdata(stb_vorbis *f);
234 // inform stb_vorbis that your next datablock will not be contiguous with
235 // previous ones (e.g. you've seeked in the data); future attempts to decode
236 // frames will cause stb_vorbis to resynchronize (as noted above), and
237 // once it sees a valid Ogg page (typically 4-8KB, as large as 64KB), it
238 // will begin decoding the _next_ frame.
239 //
240 // if you want to seek using pushdata, you need to seek in your file, then
241 // call stb_vorbis_flush_pushdata(), then start calling decoding, then once
242 // decoding is returning you data, call stb_vorbis_get_sample_offset, and
243 // if you don't like the result, seek your file again and repeat.
244 #endif
245 
246 
247 //////////   PULLING INPUT API
248 
249 #ifndef STB_VORBIS_NO_PULLDATA_API
250 // This API assumes stb_vorbis is allowed to pull data from a source--
251 // either a block of memory containing the _entire_ vorbis stream, or a
252 // FILE * that you or it create, or possibly some other reading mechanism
253 // if you go modify the source to replace the FILE * case with some kind
254 // of callback to your code. (But if you don't support seeking, you may
255 // just want to go ahead and use pushdata.)
256 
257 #if !defined(STB_VORBIS_NO_STDIO) && !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
258 extern int stb_vorbis_decode_filename(const char *filename, int *channels, int *sample_rate, short **output);
259 #endif
260 #if !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
261 extern int stb_vorbis_decode_memory(const unsigned char *mem, int len, int *channels, int *sample_rate, short **output);
262 #endif
263 // decode an entire file and output the data interleaved into a malloc()ed
264 // buffer stored in *output. The return value is the number of samples
265 // decoded, or -1 if the file could not be opened or was not an ogg vorbis file.
266 // When you're done with it, just free() the pointer returned in *output.
267 
268 extern stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len,
269                                   int *error, const stb_vorbis_alloc *alloc_buffer);
270 // create an ogg vorbis decoder from an ogg vorbis stream in memory (note
271 // this must be the entire stream!). on failure, returns NULL and sets *error
272 
273 #ifndef STB_VORBIS_NO_STDIO
274 extern stb_vorbis * stb_vorbis_open_filename(const char *filename,
275                                   int *error, const stb_vorbis_alloc *alloc_buffer);
276 // create an ogg vorbis decoder from a filename via fopen(). on failure,
277 // returns NULL and sets *error (possibly to VORBIS_file_open_failure).
278 
279 extern stb_vorbis * stb_vorbis_open_file(FILE *f, int close_handle_on_close,
280                                   int *error, const stb_vorbis_alloc *alloc_buffer);
281 // create an ogg vorbis decoder from an open FILE *, looking for a stream at
282 // the _current_ seek point (ftell). on failure, returns NULL and sets *error.
283 // note that stb_vorbis must "own" this stream; if you seek it in between
284 // calls to stb_vorbis, it will become confused. Moreover, if you attempt to
285 // perform stb_vorbis_seek_*() operations on this file, it will assume it
286 // owns the _entire_ rest of the file after the start point. Use the next
287 // function, stb_vorbis_open_file_section(), to limit it.
288 
289 extern stb_vorbis * stb_vorbis_open_file_section(FILE *f, int close_handle_on_close,
290                 int *error, const stb_vorbis_alloc *alloc_buffer, unsigned int len);
291 // create an ogg vorbis decoder from an open FILE *, looking for a stream at
292 // the _current_ seek point (ftell); the stream will be of length 'len' bytes.
293 // on failure, returns NULL and sets *error. note that stb_vorbis must "own"
294 // this stream; if you seek it in between calls to stb_vorbis, it will become
295 // confused.
296 #endif
297 
298 extern int stb_vorbis_seek_frame(stb_vorbis *f, unsigned int sample_number);
299 extern int stb_vorbis_seek(stb_vorbis *f, unsigned int sample_number);
300 // these functions seek in the Vorbis file to (approximately) 'sample_number'.
301 // after calling seek_frame(), the next call to get_frame_*() will include
302 // the specified sample. after calling stb_vorbis_seek(), the next call to
303 // stb_vorbis_get_samples_* will start with the specified sample. If you
304 // do not need to seek to EXACTLY the target sample when using get_samples_*,
305 // you can also use seek_frame().
306 
307 extern int stb_vorbis_seek_start(stb_vorbis *f);
308 // this function is equivalent to stb_vorbis_seek(f,0)
309 
310 extern unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f);
311 extern float        stb_vorbis_stream_length_in_seconds(stb_vorbis *f);
312 // these functions return the total length of the vorbis stream
313 
314 extern int stb_vorbis_get_frame_float(stb_vorbis *f, int *channels, float ***output);
315 // decode the next frame and return the number of samples. the number of
316 // channels returned are stored in *channels (which can be NULL--it is always
317 // the same as the number of channels reported by get_info). *output will
318 // contain an array of float* buffers, one per channel. These outputs will
319 // be overwritten on the next call to stb_vorbis_get_frame_*.
320 //
321 // You generally should not intermix calls to stb_vorbis_get_frame_*()
322 // and stb_vorbis_get_samples_*(), since the latter calls the former.
323 
324 #ifndef STB_VORBIS_NO_INTEGER_CONVERSION
325 extern int stb_vorbis_get_frame_short_interleaved(stb_vorbis *f, int num_c, short *buffer, int num_shorts);
326 extern int stb_vorbis_get_frame_short            (stb_vorbis *f, int num_c, short **buffer, int num_samples);
327 #endif
328 // decode the next frame and return the number of *samples* per channel.
329 // Note that for interleaved data, you pass in the number of shorts (the
330 // size of your array), but the return value is the number of samples per
331 // channel, not the total number of samples.
332 //
333 // The data is coerced to the number of channels you request according to the
334 // channel coercion rules (see below). You must pass in the size of your
335 // buffer(s) so that stb_vorbis will not overwrite the end of the buffer.
336 // The maximum buffer size needed can be gotten from get_info(); however,
337 // the Vorbis I specification implies an absolute maximum of 4096 samples
338 // per channel.
339 
340 // Channel coercion rules:
341 //    Let M be the number of channels requested, and N the number of channels present,
342 //    and Cn be the nth channel; let stereo L be the sum of all L and center channels,
343 //    and stereo R be the sum of all R and center channels (channel assignment from the
344 //    vorbis spec).
345 //        M    N       output
346 //        1    k      sum(Ck) for all k
347 //        2    *      stereo L, stereo R
348 //        k    l      k > l, the first l channels, then 0s
349 //        k    l      k <= l, the first k channels
350 //    Note that this is not _good_ surround etc. mixing at all! It's just so
351 //    you get something useful.
352 
353 extern int stb_vorbis_get_samples_float_interleaved(stb_vorbis *f, int channels, float *buffer, int num_floats);
354 extern int stb_vorbis_get_samples_float(stb_vorbis *f, int channels, float **buffer, int num_samples);
355 // gets num_samples samples, not necessarily on a frame boundary--this requires
356 // buffering so you have to supply the buffers. DOES NOT APPLY THE COERCION RULES.
357 // Returns the number of samples stored per channel; it may be less than requested
358 // at the end of the file. If there are no more samples in the file, returns 0.
359 
360 #ifndef STB_VORBIS_NO_INTEGER_CONVERSION
361 extern int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short *buffer, int num_shorts);
362 extern int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, int num_samples);
363 #endif
364 // gets num_samples samples, not necessarily on a frame boundary--this requires
365 // buffering so you have to supply the buffers. Applies the coercion rules above
366 // to produce 'channels' channels. Returns the number of samples stored per channel;
367 // it may be less than requested at the end of the file. If there are no more
368 // samples in the file, returns 0.
369 
370 #endif
371 
372 ////////   ERROR CODES
373 
374 enum STBVorbisError
375 {
376    VORBIS__no_error,
377 
378    VORBIS_need_more_data=1,             // not a real error
379 
380    VORBIS_invalid_api_mixing,           // can't mix API modes
381    VORBIS_outofmem,                     // not enough memory
382    VORBIS_feature_not_supported,        // uses floor 0
383    VORBIS_too_many_channels,            // STB_VORBIS_MAX_CHANNELS is too small
384    VORBIS_file_open_failure,            // fopen() failed
385    VORBIS_seek_without_length,          // can't seek in unknown-length file
386 
387    VORBIS_unexpected_eof=10,            // file is truncated?
388    VORBIS_seek_invalid,                 // seek past EOF
389 
390    // decoding errors (corrupt/invalid stream) -- you probably
391    // don't care about the exact details of these
392 
393    // vorbis errors:
394    VORBIS_invalid_setup=20,
395    VORBIS_invalid_stream,
396 
397    // ogg errors:
398    VORBIS_missing_capture_pattern=30,
399    VORBIS_invalid_stream_structure_version,
400    VORBIS_continued_packet_flag_invalid,
401    VORBIS_incorrect_stream_serial_number,
402    VORBIS_invalid_first_page,
403    VORBIS_bad_packet_type,
404    VORBIS_cant_find_last_page,
405    VORBIS_seek_failed,
406    VORBIS_ogg_skeleton_not_supported
407 };
408 
409 
410 #ifdef __cplusplus
411 }
412 #endif
413 
414 #endif // STB_VORBIS_INCLUDE_STB_VORBIS_H
415 //
416 //  HEADER ENDS HERE
417 //
418 //////////////////////////////////////////////////////////////////////////////
419 
420 #ifndef STB_VORBIS_HEADER_ONLY
421 
422 // global configuration settings (e.g. set these in the project/makefile),
423 // or just set them in this file at the top (although ideally the first few
424 // should be visible when the header file is compiled too, although it's not
425 // crucial)
426 
427 // STB_VORBIS_NO_PUSHDATA_API
428 //     does not compile the code for the various stb_vorbis_*_pushdata()
429 //     functions
430 // #define STB_VORBIS_NO_PUSHDATA_API
431 
432 // STB_VORBIS_NO_PULLDATA_API
433 //     does not compile the code for the non-pushdata APIs
434 // #define STB_VORBIS_NO_PULLDATA_API
435 
436 // STB_VORBIS_NO_STDIO
437 //     does not compile the code for the APIs that use FILE *s internally
438 //     or externally (implied by STB_VORBIS_NO_PULLDATA_API)
439 // #define STB_VORBIS_NO_STDIO
440 
441 // STB_VORBIS_NO_INTEGER_CONVERSION
442 //     does not compile the code for converting audio sample data from
443 //     float to integer (implied by STB_VORBIS_NO_PULLDATA_API)
444 // #define STB_VORBIS_NO_INTEGER_CONVERSION
445 
446 // STB_VORBIS_NO_FAST_SCALED_FLOAT
447 //      does not use a fast float-to-int trick to accelerate float-to-int on
448 //      most platforms which requires endianness be defined correctly.
449 //#define STB_VORBIS_NO_FAST_SCALED_FLOAT
450 
451 
452 // STB_VORBIS_MAX_CHANNELS [number]
453 //     globally define this to the maximum number of channels you need.
454 //     The spec does not put a restriction on channels except that
455 //     the count is stored in a byte, so 255 is the hard limit.
456 //     Reducing this saves about 16 bytes per value, so using 16 saves
457 //     (255-16)*16 or around 4KB. Plus anything other memory usage
458 //     I forgot to account for. Can probably go as low as 8 (7.1 audio),
459 //     6 (5.1 audio), or 2 (stereo only).
460 #ifndef STB_VORBIS_MAX_CHANNELS
461 #define STB_VORBIS_MAX_CHANNELS    16  // enough for anyone?
462 #endif
463 
464 // STB_VORBIS_PUSHDATA_CRC_COUNT [number]
465 //     after a flush_pushdata(), stb_vorbis begins scanning for the
466 //     next valid page, without backtracking. when it finds something
467 //     that looks like a page, it streams through it and verifies its
468 //     CRC32. Should that validation fail, it keeps scanning. But it's
469 //     possible that _while_ streaming through to check the CRC32 of
470 //     one candidate page, it sees another candidate page. This #define
471 //     determines how many "overlapping" candidate pages it can search
472 //     at once. Note that "real" pages are typically ~4KB to ~8KB, whereas
473 //     garbage pages could be as big as 64KB, but probably average ~16KB.
474 //     So don't hose ourselves by scanning an apparent 64KB page and
475 //     missing a ton of real ones in the interim; so minimum of 2
476 #ifndef STB_VORBIS_PUSHDATA_CRC_COUNT
477 #define STB_VORBIS_PUSHDATA_CRC_COUNT  4
478 #endif
479 
480 // STB_VORBIS_FAST_HUFFMAN_LENGTH [number]
481 //     sets the log size of the huffman-acceleration table.  Maximum
482 //     supported value is 24. with larger numbers, more decodings are O(1),
483 //     but the table size is larger so worse cache missing, so you'll have
484 //     to probe (and try multiple ogg vorbis files) to find the sweet spot.
485 #ifndef STB_VORBIS_FAST_HUFFMAN_LENGTH
486 #define STB_VORBIS_FAST_HUFFMAN_LENGTH   10
487 #endif
488 
489 // STB_VORBIS_FAST_BINARY_LENGTH [number]
490 //     sets the log size of the binary-search acceleration table. this
491 //     is used in similar fashion to the fast-huffman size to set initial
492 //     parameters for the binary search
493 
494 // STB_VORBIS_FAST_HUFFMAN_INT
495 //     The fast huffman tables are much more efficient if they can be
496 //     stored as 16-bit results instead of 32-bit results. This restricts
497 //     the codebooks to having only 65535 possible outcomes, though.
498 //     (At least, accelerated by the huffman table.)
499 #ifndef STB_VORBIS_FAST_HUFFMAN_INT
500 #define STB_VORBIS_FAST_HUFFMAN_SHORT
501 #endif
502 
503 // STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
504 //     If the 'fast huffman' search doesn't succeed, then stb_vorbis falls
505 //     back on binary searching for the correct one. This requires storing
506 //     extra tables with the huffman codes in sorted order. Defining this
507 //     symbol trades off space for speed by forcing a linear search in the
508 //     non-fast case, except for "sparse" codebooks.
509 // #define STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
510 
511 // STB_VORBIS_DIVIDES_IN_RESIDUE
512 //     stb_vorbis precomputes the result of the scalar residue decoding
513 //     that would otherwise require a divide per chunk. you can trade off
514 //     space for time by defining this symbol.
515 // #define STB_VORBIS_DIVIDES_IN_RESIDUE
516 
517 // STB_VORBIS_DIVIDES_IN_CODEBOOK
518 //     vorbis VQ codebooks can be encoded two ways: with every case explicitly
519 //     stored, or with all elements being chosen from a small range of values,
520 //     and all values possible in all elements. By default, stb_vorbis expands
521 //     this latter kind out to look like the former kind for ease of decoding,
522 //     because otherwise an integer divide-per-vector-element is required to
523 //     unpack the index. If you define STB_VORBIS_DIVIDES_IN_CODEBOOK, you can
524 //     trade off storage for speed.
525 //#define STB_VORBIS_DIVIDES_IN_CODEBOOK
526 
527 #ifdef STB_VORBIS_CODEBOOK_SHORTS
528 #error "STB_VORBIS_CODEBOOK_SHORTS is no longer supported as it produced incorrect results for some input formats"
529 #endif
530 
531 // STB_VORBIS_DIVIDE_TABLE
532 //     this replaces small integer divides in the floor decode loop with
533 //     table lookups. made less than 1% difference, so disabled by default.
534 
535 // STB_VORBIS_NO_INLINE_DECODE
536 //     disables the inlining of the scalar codebook fast-huffman decode.
537 //     might save a little codespace; useful for debugging
538 // #define STB_VORBIS_NO_INLINE_DECODE
539 
540 // STB_VORBIS_NO_DEFER_FLOOR
541 //     Normally we only decode the floor without synthesizing the actual
542 //     full curve. We can instead synthesize the curve immediately. This
543 //     requires more memory and is very likely slower, so I don't think
544 //     you'd ever want to do it except for debugging.
545 // #define STB_VORBIS_NO_DEFER_FLOOR
546 
547 
548 
549 
550 //////////////////////////////////////////////////////////////////////////////
551 
552 #ifdef STB_VORBIS_NO_PULLDATA_API
553    #define STB_VORBIS_NO_INTEGER_CONVERSION
554    #define STB_VORBIS_NO_STDIO
555 #endif
556 
557 #if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
558    #define STB_VORBIS_NO_STDIO 1
559 #endif
560 
561 #ifndef STB_VORBIS_NO_INTEGER_CONVERSION
562 #ifndef STB_VORBIS_NO_FAST_SCALED_FLOAT
563 
564    // only need endianness for fast-float-to-int, which we don't
565    // use for pushdata
566 
567    #ifndef STB_VORBIS_BIG_ENDIAN
568      #define STB_VORBIS_ENDIAN  0
569    #else
570      #define STB_VORBIS_ENDIAN  1
571    #endif
572 
573 #endif
574 #endif
575 
576 
577 #ifndef STB_VORBIS_NO_STDIO
578 #include <stdio.h>
579 #endif
580 
581 #ifndef STB_VORBIS_NO_CRT
582    #include <stdlib.h>
583    #include <string.h>
584    #include <assert.h>
585    #include <math.h>
586 
587    // find definition of alloca if it's not in stdlib.h:
588    #if defined(_MSC_VER) || defined(__MINGW32__)
589       #include <malloc.h>
590    #endif
591    #if defined(__linux__) || defined(__linux) || defined(__sun__) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__)
592       #include <alloca.h>
593    #endif
594 #else // STB_VORBIS_NO_CRT
595    #define NULL 0
596    #define malloc(s)   0
597    #define free(s)     ((void) 0)
598    #define realloc(s)  0
599 #endif // STB_VORBIS_NO_CRT
600 
601 #include <limits.h>
602 
603 #ifdef __MINGW32__
604    // eff you mingw:
605    //     "fixed":
606    //         http://sourceforge.net/p/mingw-w64/mailman/message/32882927/
607    //     "no that broke the build, reverted, who cares about C":
608    //         http://sourceforge.net/p/mingw-w64/mailman/message/32890381/
609    #ifdef __forceinline
610    #undef __forceinline
611    #endif
612    #define __forceinline
613    #ifndef alloca
614    #define alloca __builtin_alloca
615    #endif
616 #elif !defined(_MSC_VER)
617    #if __GNUC__
618       #define __forceinline inline
619    #else
620       #define __forceinline
621    #endif
622 #endif
623 
624 #if STB_VORBIS_MAX_CHANNELS > 256
625 #error "Value of STB_VORBIS_MAX_CHANNELS outside of allowed range"
626 #endif
627 
628 #if STB_VORBIS_FAST_HUFFMAN_LENGTH > 24
629 #error "Value of STB_VORBIS_FAST_HUFFMAN_LENGTH outside of allowed range"
630 #endif
631 
632 
633 #if 0
634 #include <crtdbg.h>
635 #define CHECK(f)   _CrtIsValidHeapPointer(f->channel_buffers[1])
636 #else
637 #define CHECK(f)   ((void) 0)
638 #endif
639 
640 #define MAX_BLOCKSIZE_LOG  13   // from specification
641 #define MAX_BLOCKSIZE      (1 << MAX_BLOCKSIZE_LOG)
642 
643 
644 typedef unsigned char  uint8;
645 typedef   signed char   int8;
646 typedef unsigned short uint16;
647 typedef   signed short  int16;
648 typedef unsigned int   uint32;
649 typedef   signed int    int32;
650 
651 #ifndef TRUE
652 #define TRUE 1
653 #define FALSE 0
654 #endif
655 
656 typedef float codetype;
657 
658 #ifdef _MSC_VER
659 #define STBV_NOTUSED(v)  (void)(v)
660 #else
661 #define STBV_NOTUSED(v)  (void)sizeof(v)
662 #endif
663 
664 // @NOTE
665 //
666 // Some arrays below are tagged "//varies", which means it's actually
667 // a variable-sized piece of data, but rather than malloc I assume it's
668 // small enough it's better to just allocate it all together with the
669 // main thing
670 //
671 // Most of the variables are specified with the smallest size I could pack
672 // them into. It might give better performance to make them all full-sized
673 // integers. It should be safe to freely rearrange the structures or change
674 // the sizes larger--nothing relies on silently truncating etc., nor the
675 // order of variables.
676 
677 #define FAST_HUFFMAN_TABLE_SIZE   (1 << STB_VORBIS_FAST_HUFFMAN_LENGTH)
678 #define FAST_HUFFMAN_TABLE_MASK   (FAST_HUFFMAN_TABLE_SIZE - 1)
679 
680 typedef struct
681 {
682    int dimensions, entries;
683    uint8 *codeword_lengths;
684    float  minimum_value;
685    float  delta_value;
686    uint8  value_bits;
687    uint8  lookup_type;
688    uint8  sequence_p;
689    uint8  sparse;
690    uint32 lookup_values;
691    codetype *multiplicands;
692    uint32 *codewords;
693    #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
694     int16  fast_huffman[FAST_HUFFMAN_TABLE_SIZE];
695    #else
696     int32  fast_huffman[FAST_HUFFMAN_TABLE_SIZE];
697    #endif
698    uint32 *sorted_codewords;
699    int    *sorted_values;
700    int     sorted_entries;
701 } Codebook;
702 
703 typedef struct
704 {
705    uint8 order;
706    uint16 rate;
707    uint16 bark_map_size;
708    uint8 amplitude_bits;
709    uint8 amplitude_offset;
710    uint8 number_of_books;
711    uint8 book_list[16]; // varies
712 } Floor0;
713 
714 typedef struct
715 {
716    uint8 partitions;
717    uint8 partition_class_list[32]; // varies
718    uint8 class_dimensions[16]; // varies
719    uint8 class_subclasses[16]; // varies
720    uint8 class_masterbooks[16]; // varies
721    int16 subclass_books[16][8]; // varies
722    uint16 Xlist[31*8+2]; // varies
723    uint8 sorted_order[31*8+2];
724    uint8 neighbors[31*8+2][2];
725    uint8 floor1_multiplier;
726    uint8 rangebits;
727    int values;
728 } Floor1;
729 
730 typedef union
731 {
732    Floor0 floor0;
733    Floor1 floor1;
734 } Floor;
735 
736 typedef struct
737 {
738    uint32 begin, end;
739    uint32 part_size;
740    uint8 classifications;
741    uint8 classbook;
742    uint8 **classdata;
743    int16 (*residue_books)[8];
744 } Residue;
745 
746 typedef struct
747 {
748    uint8 magnitude;
749    uint8 angle;
750    uint8 mux;
751 } MappingChannel;
752 
753 typedef struct
754 {
755    uint16 coupling_steps;
756    MappingChannel *chan;
757    uint8  submaps;
758    uint8  submap_floor[15]; // varies
759    uint8  submap_residue[15]; // varies
760 } Mapping;
761 
762 typedef struct
763 {
764    uint8 blockflag;
765    uint8 mapping;
766    uint16 windowtype;
767    uint16 transformtype;
768 } Mode;
769 
770 typedef struct
771 {
772    uint32  goal_crc;    // expected crc if match
773    int     bytes_left;  // bytes left in packet
774    uint32  crc_so_far;  // running crc
775    int     bytes_done;  // bytes processed in _current_ chunk
776    uint32  sample_loc;  // granule pos encoded in page
777 } CRCscan;
778 
779 typedef struct
780 {
781    uint32 page_start, page_end;
782    uint32 last_decoded_sample;
783 } ProbedPage;
784 
785 struct stb_vorbis
786 {
787   // user-accessible info
788    unsigned int sample_rate;
789    int channels;
790 
791    unsigned int setup_memory_required;
792    unsigned int temp_memory_required;
793    unsigned int setup_temp_memory_required;
794 
795    char *vendor;
796    int comment_list_length;
797    char **comment_list;
798 
799   // input config
800 #ifndef STB_VORBIS_NO_STDIO
801    FILE *f;
802    uint32 f_start;
803    int close_on_free;
804 #endif
805 
806    uint8 *stream;
807    uint8 *stream_start;
808    uint8 *stream_end;
809 
810    uint32 stream_len;
811 
812    uint8  push_mode;
813 
814    // the page to seek to when seeking to start, may be zero
815    uint32 first_audio_page_offset;
816 
817    // p_first is the page on which the first audio packet ends
818    // (but not necessarily the page on which it starts)
819    ProbedPage p_first, p_last;
820 
821   // memory management
822    stb_vorbis_alloc alloc;
823    int setup_offset;
824    int temp_offset;
825 
826   // run-time results
827    int eof;
828    enum STBVorbisError error;
829 
830   // user-useful data
831 
832   // header info
833    int blocksize[2];
834    int blocksize_0, blocksize_1;
835    int codebook_count;
836    Codebook *codebooks;
837    int floor_count;
838    uint16 floor_types[64]; // varies
839    Floor *floor_config;
840    int residue_count;
841    uint16 residue_types[64]; // varies
842    Residue *residue_config;
843    int mapping_count;
844    Mapping *mapping;
845    int mode_count;
846    Mode mode_config[64];  // varies
847 
848    uint32 total_samples;
849 
850   // decode buffer
851    float *channel_buffers[STB_VORBIS_MAX_CHANNELS];
852    float *outputs        [STB_VORBIS_MAX_CHANNELS];
853 
854    float *previous_window[STB_VORBIS_MAX_CHANNELS];
855    int previous_length;
856 
857    #ifndef STB_VORBIS_NO_DEFER_FLOOR
858    int16 *finalY[STB_VORBIS_MAX_CHANNELS];
859    #else
860    float *floor_buffers[STB_VORBIS_MAX_CHANNELS];
861    #endif
862 
863    uint32 current_loc; // sample location of next frame to decode
864    int    current_loc_valid;
865 
866   // per-blocksize precomputed data
867 
868    // twiddle factors
869    float *A[2],*B[2],*C[2];
870    float *window[2];
871    uint16 *bit_reverse[2];
872 
873   // current page/packet/segment streaming info
874    uint32 serial; // stream serial number for verification
875    int last_page;
876    int segment_count;
877    uint8 segments[255];
878    uint8 page_flag;
879    uint8 bytes_in_seg;
880    uint8 first_decode;
881    int next_seg;
882    int last_seg;  // flag that we're on the last segment
883    int last_seg_which; // what was the segment number of the last seg?
884    uint32 acc;
885    int valid_bits;
886    int packet_bytes;
887    int end_seg_with_known_loc;
888    uint32 known_loc_for_packet;
889    int discard_samples_deferred;
890    uint32 samples_output;
891 
892   // push mode scanning
893    int page_crc_tests; // only in push_mode: number of tests active; -1 if not searching
894 #ifndef STB_VORBIS_NO_PUSHDATA_API
895    CRCscan scan[STB_VORBIS_PUSHDATA_CRC_COUNT];
896 #endif
897 
898   // sample-access
899    int channel_buffer_start;
900    int channel_buffer_end;
901 };
902 
903 #if defined(STB_VORBIS_NO_PUSHDATA_API)
904    #define IS_PUSH_MODE(f)   FALSE
905 #elif defined(STB_VORBIS_NO_PULLDATA_API)
906    #define IS_PUSH_MODE(f)   TRUE
907 #else
908    #define IS_PUSH_MODE(f)   ((f)->push_mode)
909 #endif
910 
911 typedef struct stb_vorbis vorb;
912 
913 static int error(vorb *f, enum STBVorbisError e)
914 {
915    f->error = e;
916    if (!f->eof && e != VORBIS_need_more_data) {
917       f->error=e; // breakpoint for debugging
918    }
919    return 0;
920 }
921 
922 
923 // these functions are used for allocating temporary memory
924 // while decoding. if you can afford the stack space, use
925 // alloca(); otherwise, provide a temp buffer and it will
926 // allocate out of those.
927 
928 #define array_size_required(count,size)  (count*(sizeof(void *)+(size)))
929 
930 #define temp_alloc(f,size)              (f->alloc.alloc_buffer ? setup_temp_malloc(f,size) : alloca(size))
931 #define temp_free(f,p)                  (void)0
932 #define temp_alloc_save(f)              ((f)->temp_offset)
933 #define temp_alloc_restore(f,p)         ((f)->temp_offset = (p))
934 
935 #define temp_block_array(f,count,size)  make_block_array(temp_alloc(f,array_size_required(count,size)), count, size)
936 
937 // given a sufficiently large block of memory, make an array of pointers to subblocks of it
938 static void *make_block_array(void *mem, int count, int size)
939 {
940    int i;
941    void ** p = (void **) mem;
942    char *q = (char *) (p + count);
943    for (i=0; i < count; ++i) {
944       p[i] = q;
945       q += size;
946    }
947    return p;
948 }
949 
950 static void *setup_malloc(vorb *f, int sz)
951 {
952    sz = (sz+7) & ~7; // round up to nearest 8 for alignment of future allocs.
953    f->setup_memory_required += sz;
954    if (f->alloc.alloc_buffer) {
955       void *p = (char *) f->alloc.alloc_buffer + f->setup_offset;
956       if (f->setup_offset + sz > f->temp_offset) return NULL;
957       f->setup_offset += sz;
958       return p;
959    }
960    return sz ? malloc(sz) : NULL;
961 }
962 
963 static void setup_free(vorb *f, void *p)
964 {
965    if (f->alloc.alloc_buffer) return; // do nothing; setup mem is a stack
966    free(p);
967 }
968 
969 static void *setup_temp_malloc(vorb *f, int sz)
970 {
971    sz = (sz+7) & ~7; // round up to nearest 8 for alignment of future allocs.
972    if (f->alloc.alloc_buffer) {
973       if (f->temp_offset - sz < f->setup_offset) return NULL;
974       f->temp_offset -= sz;
975       return (char *) f->alloc.alloc_buffer + f->temp_offset;
976    }
977    return malloc(sz);
978 }
979 
980 static void setup_temp_free(vorb *f, void *p, int sz)
981 {
982    if (f->alloc.alloc_buffer) {
983       f->temp_offset += (sz+7)&~7;
984       return;
985    }
986    free(p);
987 }
988 
989 #define CRC32_POLY    0x04c11db7   // from spec
990 
991 static uint32 crc_table[256];
992 static void crc32_init(void)
993 {
994    int i,j;
995    uint32 s;
996    for(i=0; i < 256; i++) {
997       for (s=(uint32) i << 24, j=0; j < 8; ++j)
998          s = (s << 1) ^ (s >= (1U<<31) ? CRC32_POLY : 0);
999       crc_table[i] = s;
1000    }
1001 }
1002 
1003 static __forceinline uint32 crc32_update(uint32 crc, uint8 byte)
1004 {
1005    return (crc << 8) ^ crc_table[byte ^ (crc >> 24)];
1006 }
1007 
1008 
1009 // used in setup, and for huffman that doesn't go fast path
1010 static unsigned int bit_reverse(unsigned int n)
1011 {
1012   n = ((n & 0xAAAAAAAA) >>  1) | ((n & 0x55555555) << 1);
1013   n = ((n & 0xCCCCCCCC) >>  2) | ((n & 0x33333333) << 2);
1014   n = ((n & 0xF0F0F0F0) >>  4) | ((n & 0x0F0F0F0F) << 4);
1015   n = ((n & 0xFF00FF00) >>  8) | ((n & 0x00FF00FF) << 8);
1016   return (n >> 16) | (n << 16);
1017 }
1018 
1019 static float square(float x)
1020 {
1021    return x*x;
1022 }
1023 
1024 // this is a weird definition of log2() for which log2(1) = 1, log2(2) = 2, log2(4) = 3
1025 // as required by the specification. fast(?) implementation from stb.h
1026 // @OPTIMIZE: called multiple times per-packet with "constants"; move to setup
1027 static int ilog(int32 n)
1028 {
1029    static signed char log2_4[16] = { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
1030 
1031    if (n < 0) return 0; // signed n returns 0
1032 
1033    // 2 compares if n < 16, 3 compares otherwise (4 if signed or n > 1<<29)
1034    if (n < (1 << 14))
1035         if (n < (1 <<  4))            return  0 + log2_4[n      ];
1036         else if (n < (1 <<  9))       return  5 + log2_4[n >>  5];
1037              else                     return 10 + log2_4[n >> 10];
1038    else if (n < (1 << 24))
1039              if (n < (1 << 19))       return 15 + log2_4[n >> 15];
1040              else                     return 20 + log2_4[n >> 20];
1041         else if (n < (1 << 29))       return 25 + log2_4[n >> 25];
1042              else                     return 30 + log2_4[n >> 30];
1043 }
1044 
1045 #ifndef M_PI
1046   #define M_PI  3.14159265358979323846264f  // from CRC
1047 #endif
1048 
1049 // code length assigned to a value with no huffman encoding
1050 #define NO_CODE   255
1051 
1052 /////////////////////// LEAF SETUP FUNCTIONS //////////////////////////
1053 //
1054 // these functions are only called at setup, and only a few times
1055 // per file
1056 
1057 static float float32_unpack(uint32 x)
1058 {
1059    // from the specification
1060    uint32 mantissa = x & 0x1fffff;
1061    uint32 sign = x & 0x80000000;
1062    uint32 exp = (x & 0x7fe00000) >> 21;
1063    double res = sign ? -(double)mantissa : (double)mantissa;
1064    return (float) ldexp((float)res, (int)exp-788);
1065 }
1066 
1067 
1068 // zlib & jpeg huffman tables assume that the output symbols
1069 // can either be arbitrarily arranged, or have monotonically
1070 // increasing frequencies--they rely on the lengths being sorted;
1071 // this makes for a very simple generation algorithm.
1072 // vorbis allows a huffman table with non-sorted lengths. This
1073 // requires a more sophisticated construction, since symbols in
1074 // order do not map to huffman codes "in order".
1075 static void add_entry(Codebook *c, uint32 huff_code, int symbol, int count, int len, uint32 *values)
1076 {
1077    if (!c->sparse) {
1078       c->codewords      [symbol] = huff_code;
1079    } else {
1080       c->codewords       [count] = huff_code;
1081       c->codeword_lengths[count] = len;
1082       values             [count] = symbol;
1083    }
1084 }
1085 
1086 static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values)
1087 {
1088    int i,k,m=0;
1089    uint32 available[32];
1090 
1091    memset(available, 0, sizeof(available));
1092    // find the first entry
1093    for (k=0; k < n; ++k) if (len[k] < NO_CODE) break;
1094    if (k == n) { assert(c->sorted_entries == 0); return TRUE; }
1095    assert(len[k] < 32); // no error return required, code reading lens checks this
1096    // add to the list
1097    add_entry(c, 0, k, m++, len[k], values);
1098    // add all available leaves
1099    for (i=1; i <= len[k]; ++i)
1100       available[i] = 1U << (32-i);
1101    // note that the above code treats the first case specially,
1102    // but it's really the same as the following code, so they
1103    // could probably be combined (except the initial code is 0,
1104    // and I use 0 in available[] to mean 'empty')
1105    for (i=k+1; i < n; ++i) {
1106       uint32 res;
1107       int z = len[i], y;
1108       if (z == NO_CODE) continue;
1109       assert(z < 32); // no error return required, code reading lens checks this
1110       // find lowest available leaf (should always be earliest,
1111       // which is what the specification calls for)
1112       // note that this property, and the fact we can never have
1113       // more than one free leaf at a given level, isn't totally
1114       // trivial to prove, but it seems true and the assert never
1115       // fires, so!
1116       while (z > 0 && !available[z]) --z;
1117       if (z == 0) { return FALSE; }
1118       res = available[z];
1119       available[z] = 0;
1120       add_entry(c, bit_reverse(res), i, m++, len[i], values);
1121       // propagate availability up the tree
1122       if (z != len[i]) {
1123          for (y=len[i]; y > z; --y) {
1124             assert(available[y] == 0);
1125             available[y] = res + (1 << (32-y));
1126          }
1127       }
1128    }
1129    return TRUE;
1130 }
1131 
1132 // accelerated huffman table allows fast O(1) match of all symbols
1133 // of length <= STB_VORBIS_FAST_HUFFMAN_LENGTH
1134 static void compute_accelerated_huffman(Codebook *c)
1135 {
1136    int i, len;
1137    for (i=0; i < FAST_HUFFMAN_TABLE_SIZE; ++i)
1138       c->fast_huffman[i] = -1;
1139 
1140    len = c->sparse ? c->sorted_entries : c->entries;
1141    #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
1142    if (len > 32767) len = 32767; // largest possible value we can encode!
1143    #endif
1144    for (i=0; i < len; ++i) {
1145       if (c->codeword_lengths[i] <= STB_VORBIS_FAST_HUFFMAN_LENGTH) {
1146          uint32 z = c->sparse ? bit_reverse(c->sorted_codewords[i]) : c->codewords[i];
1147          // set table entries for all bit combinations in the higher bits
1148          while (z < FAST_HUFFMAN_TABLE_SIZE) {
1149              c->fast_huffman[z] = i;
1150              z += 1 << c->codeword_lengths[i];
1151          }
1152       }
1153    }
1154 }
1155 
1156 #ifdef _MSC_VER
1157 #define STBV_CDECL __cdecl
1158 #else
1159 #define STBV_CDECL
1160 #endif
1161 
1162 static int STBV_CDECL uint32_compare(const void *p, const void *q)
1163 {
1164    uint32 x = * (uint32 *) p;
1165    uint32 y = * (uint32 *) q;
1166    return x < y ? -1 : x > y;
1167 }
1168 
1169 static int include_in_sort(Codebook *c, uint8 len)
1170 {
1171    if (c->sparse) { assert(len != NO_CODE); return TRUE; }
1172    if (len == NO_CODE) return FALSE;
1173    if (len > STB_VORBIS_FAST_HUFFMAN_LENGTH) return TRUE;
1174    return FALSE;
1175 }
1176 
1177 // if the fast table above doesn't work, we want to binary
1178 // search them... need to reverse the bits
1179 static void compute_sorted_huffman(Codebook *c, uint8 *lengths, uint32 *values)
1180 {
1181    int i, len;
1182    // build a list of all the entries
1183    // OPTIMIZATION: don't include the short ones, since they'll be caught by FAST_HUFFMAN.
1184    // this is kind of a frivolous optimization--I don't see any performance improvement,
1185    // but it's like 4 extra lines of code, so.
1186    if (!c->sparse) {
1187       int k = 0;
1188       for (i=0; i < c->entries; ++i)
1189          if (include_in_sort(c, lengths[i]))
1190             c->sorted_codewords[k++] = bit_reverse(c->codewords[i]);
1191       assert(k == c->sorted_entries);
1192    } else {
1193       for (i=0; i < c->sorted_entries; ++i)
1194          c->sorted_codewords[i] = bit_reverse(c->codewords[i]);
1195    }
1196 
1197    qsort(c->sorted_codewords, c->sorted_entries, sizeof(c->sorted_codewords[0]), uint32_compare);
1198    c->sorted_codewords[c->sorted_entries] = 0xffffffff;
1199 
1200    len = c->sparse ? c->sorted_entries : c->entries;
1201    // now we need to indicate how they correspond; we could either
1202    //   #1: sort a different data structure that says who they correspond to
1203    //   #2: for each sorted entry, search the original list to find who corresponds
1204    //   #3: for each original entry, find the sorted entry
1205    // #1 requires extra storage, #2 is slow, #3 can use binary search!
1206    for (i=0; i < len; ++i) {
1207       int huff_len = c->sparse ? lengths[values[i]] : lengths[i];
1208       if (include_in_sort(c,huff_len)) {
1209          uint32 code = bit_reverse(c->codewords[i]);
1210          int x=0, n=c->sorted_entries;
1211          while (n > 1) {
1212             // invariant: sc[x] <= code < sc[x+n]
1213             int m = x + (n >> 1);
1214             if (c->sorted_codewords[m] <= code) {
1215                x = m;
1216                n -= (n>>1);
1217             } else {
1218                n >>= 1;
1219             }
1220          }
1221          assert(c->sorted_codewords[x] == code);
1222          if (c->sparse) {
1223             c->sorted_values[x] = values[i];
1224             c->codeword_lengths[x] = huff_len;
1225          } else {
1226             c->sorted_values[x] = i;
1227          }
1228       }
1229    }
1230 }
1231 
1232 // only run while parsing the header (3 times)
1233 static int vorbis_validate(uint8 *data)
1234 {
1235    static uint8 vorbis[6] = { 'v', 'o', 'r', 'b', 'i', 's' };
1236    return memcmp(data, vorbis, 6) == 0;
1237 }
1238 
1239 // called from setup only, once per code book
1240 // (formula implied by specification)
1241 static int lookup1_values(int entries, int dim)
1242 {
1243    int r = (int) floor(exp((float) log((float) entries) / dim));
1244    if ((int) floor(pow((float) r+1, dim)) <= entries)   // (int) cast for MinGW warning;
1245       ++r;                                              // floor() to avoid _ftol() when non-CRT
1246    if (pow((float) r+1, dim) <= entries)
1247       return -1;
1248    if ((int) floor(pow((float) r, dim)) > entries)
1249       return -1;
1250    return r;
1251 }
1252 
1253 // called twice per file
1254 static void compute_twiddle_factors(int n, float *A, float *B, float *C)
1255 {
1256    int n4 = n >> 2, n8 = n >> 3;
1257    int k,k2;
1258 
1259    for (k=k2=0; k < n4; ++k,k2+=2) {
1260       A[k2  ] = (float)  cos(4*k*M_PI/n);
1261       A[k2+1] = (float) -sin(4*k*M_PI/n);
1262       B[k2  ] = (float)  cos((k2+1)*M_PI/n/2) * 0.5f;
1263       B[k2+1] = (float)  sin((k2+1)*M_PI/n/2) * 0.5f;
1264    }
1265    for (k=k2=0; k < n8; ++k,k2+=2) {
1266       C[k2  ] = (float)  cos(2*(k2+1)*M_PI/n);
1267       C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
1268    }
1269 }
1270 
1271 static void compute_window(int n, float *window)
1272 {
1273    int n2 = n >> 1, i;
1274    for (i=0; i < n2; ++i)
1275       window[i] = (float) sin(0.5 * M_PI * square((float) sin((i - 0 + 0.5) / n2 * 0.5 * M_PI)));
1276 }
1277 
1278 static void compute_bitreverse(int n, uint16 *rev)
1279 {
1280    int ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
1281    int i, n8 = n >> 3;
1282    for (i=0; i < n8; ++i)
1283       rev[i] = (bit_reverse(i) >> (32-ld+3)) << 2;
1284 }
1285 
1286 static int init_blocksize(vorb *f, int b, int n)
1287 {
1288    int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
1289    f->A[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1290    f->B[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1291    f->C[b] = (float *) setup_malloc(f, sizeof(float) * n4);
1292    if (!f->A[b] || !f->B[b] || !f->C[b]) return error(f, VORBIS_outofmem);
1293    compute_twiddle_factors(n, f->A[b], f->B[b], f->C[b]);
1294    f->window[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1295    if (!f->window[b]) return error(f, VORBIS_outofmem);
1296    compute_window(n, f->window[b]);
1297    f->bit_reverse[b] = (uint16 *) setup_malloc(f, sizeof(uint16) * n8);
1298    if (!f->bit_reverse[b]) return error(f, VORBIS_outofmem);
1299    compute_bitreverse(n, f->bit_reverse[b]);
1300    return TRUE;
1301 }
1302 
1303 static void neighbors(uint16 *x, int n, int *plow, int *phigh)
1304 {
1305    int low = -1;
1306    int high = 65536;
1307    int i;
1308    for (i=0; i < n; ++i) {
1309       if (x[i] > low  && x[i] < x[n]) { *plow  = i; low = x[i]; }
1310       if (x[i] < high && x[i] > x[n]) { *phigh = i; high = x[i]; }
1311    }
1312 }
1313 
1314 // this has been repurposed so y is now the original index instead of y
1315 typedef struct
1316 {
1317    uint16 x,id;
1318 } stbv__floor_ordering;
1319 
1320 static int STBV_CDECL point_compare(const void *p, const void *q)
1321 {
1322    stbv__floor_ordering *a = (stbv__floor_ordering *) p;
1323    stbv__floor_ordering *b = (stbv__floor_ordering *) q;
1324    return a->x < b->x ? -1 : a->x > b->x;
1325 }
1326 
1327 //
1328 /////////////////////// END LEAF SETUP FUNCTIONS //////////////////////////
1329 
1330 
1331 #if defined(STB_VORBIS_NO_STDIO)
1332    #define USE_MEMORY(z)    TRUE
1333 #else
1334    #define USE_MEMORY(z)    ((z)->stream)
1335 #endif
1336 
1337 static uint8 get8(vorb *z)
1338 {
1339    if (USE_MEMORY(z)) {
1340       if (z->stream >= z->stream_end) { z->eof = TRUE; return 0; }
1341       return *z->stream++;
1342    }
1343 
1344    #ifndef STB_VORBIS_NO_STDIO
1345    {
1346    int c = fgetc(z->f);
1347    if (c == EOF) { z->eof = TRUE; return 0; }
1348    return c;
1349    }
1350    #endif
1351 }
1352 
1353 static uint32 get32(vorb *f)
1354 {
1355    uint32 x;
1356    x = get8(f);
1357    x += get8(f) << 8;
1358    x += get8(f) << 16;
1359    x += (uint32) get8(f) << 24;
1360    return x;
1361 }
1362 
1363 static int getn(vorb *z, uint8 *data, int n)
1364 {
1365    if (USE_MEMORY(z)) {
1366       if (z->stream+n > z->stream_end) { z->eof = 1; return 0; }
1367       memcpy(data, z->stream, n);
1368       z->stream += n;
1369       return 1;
1370    }
1371 
1372    #ifndef STB_VORBIS_NO_STDIO
1373    if (fread(data, n, 1, z->f) == 1)
1374       return 1;
1375    else {
1376       z->eof = 1;
1377       return 0;
1378    }
1379    #endif
1380 }
1381 
1382 static void skip(vorb *z, int n)
1383 {
1384    if (USE_MEMORY(z)) {
1385       z->stream += n;
1386       if (z->stream >= z->stream_end) z->eof = 1;
1387       return;
1388    }
1389    #ifndef STB_VORBIS_NO_STDIO
1390    {
1391       long x = ftell(z->f);
1392       fseek(z->f, x+n, SEEK_SET);
1393    }
1394    #endif
1395 }
1396 
1397 static int set_file_offset(stb_vorbis *f, unsigned int loc)
1398 {
1399    #ifndef STB_VORBIS_NO_PUSHDATA_API
1400    if (f->push_mode) return 0;
1401    #endif
1402    f->eof = 0;
1403    if (USE_MEMORY(f)) {
1404       if (f->stream_start + loc >= f->stream_end || f->stream_start + loc < f->stream_start) {
1405          f->stream = f->stream_end;
1406          f->eof = 1;
1407          return 0;
1408       } else {
1409          f->stream = f->stream_start + loc;
1410          return 1;
1411       }
1412    }
1413    #ifndef STB_VORBIS_NO_STDIO
1414    if (loc + f->f_start < loc || loc >= 0x80000000) {
1415       loc = 0x7fffffff;
1416       f->eof = 1;
1417    } else {
1418       loc += f->f_start;
1419    }
1420    if (!fseek(f->f, loc, SEEK_SET))
1421       return 1;
1422    f->eof = 1;
1423    fseek(f->f, f->f_start, SEEK_END);
1424    return 0;
1425    #endif
1426 }
1427 
1428 
1429 static uint8 ogg_page_header[4] = { 0x4f, 0x67, 0x67, 0x53 };
1430 
1431 static int capture_pattern(vorb *f)
1432 {
1433    if (0x4f != get8(f)) return FALSE;
1434    if (0x67 != get8(f)) return FALSE;
1435    if (0x67 != get8(f)) return FALSE;
1436    if (0x53 != get8(f)) return FALSE;
1437    return TRUE;
1438 }
1439 
1440 #define PAGEFLAG_continued_packet   1
1441 #define PAGEFLAG_first_page         2
1442 #define PAGEFLAG_last_page          4
1443 
1444 static int start_page_no_capturepattern(vorb *f)
1445 {
1446    uint32 loc0,loc1,n;
1447    if (f->first_decode && !IS_PUSH_MODE(f)) {
1448       f->p_first.page_start = stb_vorbis_get_file_offset(f) - 4;
1449    }
1450    // stream structure version
1451    if (0 != get8(f)) return error(f, VORBIS_invalid_stream_structure_version);
1452    // header flag
1453    f->page_flag = get8(f);
1454    // absolute granule position
1455    loc0 = get32(f);
1456    loc1 = get32(f);
1457    // @TODO: validate loc0,loc1 as valid positions?
1458    // stream serial number -- vorbis doesn't interleave, so discard
1459    get32(f);
1460    //if (f->serial != get32(f)) return error(f, VORBIS_incorrect_stream_serial_number);
1461    // page sequence number
1462    n = get32(f);
1463    f->last_page = n;
1464    // CRC32
1465    get32(f);
1466    // page_segments
1467    f->segment_count = get8(f);
1468    if (!getn(f, f->segments, f->segment_count))
1469       return error(f, VORBIS_unexpected_eof);
1470    // assume we _don't_ know any the sample position of any segments
1471    f->end_seg_with_known_loc = -2;
1472    if (loc0 != ~0U || loc1 != ~0U) {
1473       int i;
1474       // determine which packet is the last one that will complete
1475       for (i=f->segment_count-1; i >= 0; --i)
1476          if (f->segments[i] < 255)
1477             break;
1478       // 'i' is now the index of the _last_ segment of a packet that ends
1479       if (i >= 0) {
1480          f->end_seg_with_known_loc = i;
1481          f->known_loc_for_packet   = loc0;
1482       }
1483    }
1484    if (f->first_decode) {
1485       int i,len;
1486       len = 0;
1487       for (i=0; i < f->segment_count; ++i)
1488          len += f->segments[i];
1489       len += 27 + f->segment_count;
1490       f->p_first.page_end = f->p_first.page_start + len;
1491       f->p_first.last_decoded_sample = loc0;
1492    }
1493    f->next_seg = 0;
1494    return TRUE;
1495 }
1496 
1497 static int start_page(vorb *f)
1498 {
1499    if (!capture_pattern(f)) return error(f, VORBIS_missing_capture_pattern);
1500    return start_page_no_capturepattern(f);
1501 }
1502 
1503 static int start_packet(vorb *f)
1504 {
1505    while (f->next_seg == -1) {
1506       if (!start_page(f)) return FALSE;
1507       if (f->page_flag & PAGEFLAG_continued_packet)
1508          return error(f, VORBIS_continued_packet_flag_invalid);
1509    }
1510    f->last_seg = FALSE;
1511    f->valid_bits = 0;
1512    f->packet_bytes = 0;
1513    f->bytes_in_seg = 0;
1514    // f->next_seg is now valid
1515    return TRUE;
1516 }
1517 
1518 static int maybe_start_packet(vorb *f)
1519 {
1520    if (f->next_seg == -1) {
1521       int x = get8(f);
1522       if (f->eof) return FALSE; // EOF at page boundary is not an error!
1523       if (0x4f != x      ) return error(f, VORBIS_missing_capture_pattern);
1524       if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1525       if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1526       if (0x53 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1527       if (!start_page_no_capturepattern(f)) return FALSE;
1528       if (f->page_flag & PAGEFLAG_continued_packet) {
1529          // set up enough state that we can read this packet if we want,
1530          // e.g. during recovery
1531          f->last_seg = FALSE;
1532          f->bytes_in_seg = 0;
1533          return error(f, VORBIS_continued_packet_flag_invalid);
1534       }
1535    }
1536    return start_packet(f);
1537 }
1538 
1539 static int next_segment(vorb *f)
1540 {
1541    int len;
1542    if (f->last_seg) return 0;
1543    if (f->next_seg == -1) {
1544       f->last_seg_which = f->segment_count-1; // in case start_page fails
1545       if (!start_page(f)) { f->last_seg = 1; return 0; }
1546       if (!(f->page_flag & PAGEFLAG_continued_packet)) return error(f, VORBIS_continued_packet_flag_invalid);
1547    }
1548    len = f->segments[f->next_seg++];
1549    if (len < 255) {
1550       f->last_seg = TRUE;
1551       f->last_seg_which = f->next_seg-1;
1552    }
1553    if (f->next_seg >= f->segment_count)
1554       f->next_seg = -1;
1555    assert(f->bytes_in_seg == 0);
1556    f->bytes_in_seg = len;
1557    return len;
1558 }
1559 
1560 #define EOP    (-1)
1561 #define INVALID_BITS  (-1)
1562 
1563 static int get8_packet_raw(vorb *f)
1564 {
1565    if (!f->bytes_in_seg) {  // CLANG!
1566       if (f->last_seg) return EOP;
1567       else if (!next_segment(f)) return EOP;
1568    }
1569    assert(f->bytes_in_seg > 0);
1570    --f->bytes_in_seg;
1571    ++f->packet_bytes;
1572    return get8(f);
1573 }
1574 
1575 static int get8_packet(vorb *f)
1576 {
1577    int x = get8_packet_raw(f);
1578    f->valid_bits = 0;
1579    return x;
1580 }
1581 
1582 static int get32_packet(vorb *f)
1583 {
1584    uint32 x;
1585    x = get8_packet(f);
1586    x += get8_packet(f) << 8;
1587    x += get8_packet(f) << 16;
1588    x += (uint32) get8_packet(f) << 24;
1589    return x;
1590 }
1591 
1592 static void flush_packet(vorb *f)
1593 {
1594    while (get8_packet_raw(f) != EOP);
1595 }
1596 
1597 // @OPTIMIZE: this is the secondary bit decoder, so it's probably not as important
1598 // as the huffman decoder?
1599 static uint32 get_bits(vorb *f, int n)
1600 {
1601    uint32 z;
1602 
1603    if (f->valid_bits < 0) return 0;
1604    if (f->valid_bits < n) {
1605       if (n > 24) {
1606          // the accumulator technique below would not work correctly in this case
1607          z = get_bits(f, 24);
1608          z += get_bits(f, n-24) << 24;
1609          return z;
1610       }
1611       if (f->valid_bits == 0) f->acc = 0;
1612       while (f->valid_bits < n) {
1613          int z = get8_packet_raw(f);
1614          if (z == EOP) {
1615             f->valid_bits = INVALID_BITS;
1616             return 0;
1617          }
1618          f->acc += z << f->valid_bits;
1619          f->valid_bits += 8;
1620       }
1621    }
1622 
1623    assert(f->valid_bits >= n);
1624    z = f->acc & ((1 << n)-1);
1625    f->acc >>= n;
1626    f->valid_bits -= n;
1627    return z;
1628 }
1629 
1630 // @OPTIMIZE: primary accumulator for huffman
1631 // expand the buffer to as many bits as possible without reading off end of packet
1632 // it might be nice to allow f->valid_bits and f->acc to be stored in registers,
1633 // e.g. cache them locally and decode locally
1634 static __forceinline void prep_huffman(vorb *f)
1635 {
1636    if (f->valid_bits <= 24) {
1637       if (f->valid_bits == 0) f->acc = 0;
1638       do {
1639          int z;
1640          if (f->last_seg && !f->bytes_in_seg) return;
1641          z = get8_packet_raw(f);
1642          if (z == EOP) return;
1643          f->acc += (unsigned) z << f->valid_bits;
1644          f->valid_bits += 8;
1645       } while (f->valid_bits <= 24);
1646    }
1647 }
1648 
1649 enum
1650 {
1651    VORBIS_packet_id = 1,
1652    VORBIS_packet_comment = 3,
1653    VORBIS_packet_setup = 5
1654 };
1655 
1656 static int codebook_decode_scalar_raw(vorb *f, Codebook *c)
1657 {
1658    int i;
1659    prep_huffman(f);
1660 
1661    if (c->codewords == NULL && c->sorted_codewords == NULL)
1662       return -1;
1663 
1664    // cases to use binary search: sorted_codewords && !c->codewords
1665    //                             sorted_codewords && c->entries > 8
1666    if (c->entries > 8 ? c->sorted_codewords!=NULL : !c->codewords) {
1667       // binary search
1668       uint32 code = bit_reverse(f->acc);
1669       int x=0, n=c->sorted_entries, len;
1670 
1671       while (n > 1) {
1672          // invariant: sc[x] <= code < sc[x+n]
1673          int m = x + (n >> 1);
1674          if (c->sorted_codewords[m] <= code) {
1675             x = m;
1676             n -= (n>>1);
1677          } else {
1678             n >>= 1;
1679          }
1680       }
1681       // x is now the sorted index
1682       if (!c->sparse) x = c->sorted_values[x];
1683       // x is now sorted index if sparse, or symbol otherwise
1684       len = c->codeword_lengths[x];
1685       if (f->valid_bits >= len) {
1686          f->acc >>= len;
1687          f->valid_bits -= len;
1688          return x;
1689       }
1690 
1691       f->valid_bits = 0;
1692       return -1;
1693    }
1694 
1695    // if small, linear search
1696    assert(!c->sparse);
1697    for (i=0; i < c->entries; ++i) {
1698       if (c->codeword_lengths[i] == NO_CODE) continue;
1699       if (c->codewords[i] == (f->acc & ((1 << c->codeword_lengths[i])-1))) {
1700          if (f->valid_bits >= c->codeword_lengths[i]) {
1701             f->acc >>= c->codeword_lengths[i];
1702             f->valid_bits -= c->codeword_lengths[i];
1703             return i;
1704          }
1705          f->valid_bits = 0;
1706          return -1;
1707       }
1708    }
1709 
1710    error(f, VORBIS_invalid_stream);
1711    f->valid_bits = 0;
1712    return -1;
1713 }
1714 
1715 #ifndef STB_VORBIS_NO_INLINE_DECODE
1716 
1717 #define DECODE_RAW(var, f,c)                                  \
1718    if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH)        \
1719       prep_huffman(f);                                        \
1720    var = f->acc & FAST_HUFFMAN_TABLE_MASK;                    \
1721    var = c->fast_huffman[var];                                \
1722    if (var >= 0) {                                            \
1723       int n = c->codeword_lengths[var];                       \
1724       f->acc >>= n;                                           \
1725       f->valid_bits -= n;                                     \
1726       if (f->valid_bits < 0) { f->valid_bits = 0; var = -1; } \
1727    } else {                                                   \
1728       var = codebook_decode_scalar_raw(f,c);                  \
1729    }
1730 
1731 #else
1732 
1733 static int codebook_decode_scalar(vorb *f, Codebook *c)
1734 {
1735    int i;
1736    if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH)
1737       prep_huffman(f);
1738    // fast huffman table lookup
1739    i = f->acc & FAST_HUFFMAN_TABLE_MASK;
1740    i = c->fast_huffman[i];
1741    if (i >= 0) {
1742       f->acc >>= c->codeword_lengths[i];
1743       f->valid_bits -= c->codeword_lengths[i];
1744       if (f->valid_bits < 0) { f->valid_bits = 0; return -1; }
1745       return i;
1746    }
1747    return codebook_decode_scalar_raw(f,c);
1748 }
1749 
1750 #define DECODE_RAW(var,f,c)    var = codebook_decode_scalar(f,c);
1751 
1752 #endif
1753 
1754 #define DECODE(var,f,c)                                       \
1755    DECODE_RAW(var,f,c)                                        \
1756    if (c->sparse) var = c->sorted_values[var];
1757 
1758 #ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
1759   #define DECODE_VQ(var,f,c)   DECODE_RAW(var,f,c)
1760 #else
1761   #define DECODE_VQ(var,f,c)   DECODE(var,f,c)
1762 #endif
1763 
1764 
1765 
1766 
1767 
1768 
1769 // CODEBOOK_ELEMENT_FAST is an optimization for the CODEBOOK_FLOATS case
1770 // where we avoid one addition
1771 #define CODEBOOK_ELEMENT(c,off)          (c->multiplicands[off])
1772 #define CODEBOOK_ELEMENT_FAST(c,off)     (c->multiplicands[off])
1773 #define CODEBOOK_ELEMENT_BASE(c)         (0)
1774 
1775 static int codebook_decode_start(vorb *f, Codebook *c)
1776 {
1777    int z = -1;
1778 
1779    // type 0 is only legal in a scalar context
1780    if (c->lookup_type == 0)
1781       error(f, VORBIS_invalid_stream);
1782    else {
1783       DECODE_VQ(z,f,c);
1784       if (c->sparse) assert(z < c->sorted_entries);
1785       if (z < 0) {  // check for EOP
1786          if (!f->bytes_in_seg)
1787             if (f->last_seg)
1788                return z;
1789          error(f, VORBIS_invalid_stream);
1790       }
1791    }
1792    return z;
1793 }
1794 
1795 static int codebook_decode(vorb *f, Codebook *c, float *output, int len)
1796 {
1797    int i,z = codebook_decode_start(f,c);
1798    if (z < 0) return FALSE;
1799    if (len > c->dimensions) len = c->dimensions;
1800 
1801 #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1802    if (c->lookup_type == 1) {
1803       float last = CODEBOOK_ELEMENT_BASE(c);
1804       int div = 1;
1805       for (i=0; i < len; ++i) {
1806          int off = (z / div) % c->lookup_values;
1807          float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1808          output[i] += val;
1809          if (c->sequence_p) last = val + c->minimum_value;
1810          div *= c->lookup_values;
1811       }
1812       return TRUE;
1813    }
1814 #endif
1815 
1816    z *= c->dimensions;
1817    if (c->sequence_p) {
1818       float last = CODEBOOK_ELEMENT_BASE(c);
1819       for (i=0; i < len; ++i) {
1820          float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1821          output[i] += val;
1822          last = val + c->minimum_value;
1823       }
1824    } else {
1825       float last = CODEBOOK_ELEMENT_BASE(c);
1826       for (i=0; i < len; ++i) {
1827          output[i] += CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1828       }
1829    }
1830 
1831    return TRUE;
1832 }
1833 
1834 static int codebook_decode_step(vorb *f, Codebook *c, float *output, int len, int step)
1835 {
1836    int i,z = codebook_decode_start(f,c);
1837    float last = CODEBOOK_ELEMENT_BASE(c);
1838    if (z < 0) return FALSE;
1839    if (len > c->dimensions) len = c->dimensions;
1840 
1841 #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1842    if (c->lookup_type == 1) {
1843       int div = 1;
1844       for (i=0; i < len; ++i) {
1845          int off = (z / div) % c->lookup_values;
1846          float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1847          output[i*step] += val;
1848          if (c->sequence_p) last = val;
1849          div *= c->lookup_values;
1850       }
1851       return TRUE;
1852    }
1853 #endif
1854 
1855    z *= c->dimensions;
1856    for (i=0; i < len; ++i) {
1857       float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1858       output[i*step] += val;
1859       if (c->sequence_p) last = val;
1860    }
1861 
1862    return TRUE;
1863 }
1864 
1865 static int codebook_decode_deinterleave_repeat(vorb *f, Codebook *c, float **outputs, int ch, int *c_inter_p, int *p_inter_p, int len, int total_decode)
1866 {
1867    int c_inter = *c_inter_p;
1868    int p_inter = *p_inter_p;
1869    int i,z, effective = c->dimensions;
1870 
1871    // type 0 is only legal in a scalar context
1872    if (c->lookup_type == 0)   return error(f, VORBIS_invalid_stream);
1873 
1874    while (total_decode > 0) {
1875       float last = CODEBOOK_ELEMENT_BASE(c);
1876       DECODE_VQ(z,f,c);
1877       #ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
1878       assert(!c->sparse || z < c->sorted_entries);
1879       #endif
1880       if (z < 0) {
1881          if (!f->bytes_in_seg)
1882             if (f->last_seg) return FALSE;
1883          return error(f, VORBIS_invalid_stream);
1884       }
1885 
1886       // if this will take us off the end of the buffers, stop short!
1887       // we check by computing the length of the virtual interleaved
1888       // buffer (len*ch), our current offset within it (p_inter*ch)+(c_inter),
1889       // and the length we'll be using (effective)
1890       if (c_inter + p_inter*ch + effective > len * ch) {
1891          effective = len*ch - (p_inter*ch - c_inter);
1892       }
1893 
1894    #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1895       if (c->lookup_type == 1) {
1896          int div = 1;
1897          for (i=0; i < effective; ++i) {
1898             int off = (z / div) % c->lookup_values;
1899             float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1900             if (outputs[c_inter])
1901                outputs[c_inter][p_inter] += val;
1902             if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1903             if (c->sequence_p) last = val;
1904             div *= c->lookup_values;
1905          }
1906       } else
1907    #endif
1908       {
1909          z *= c->dimensions;
1910          if (c->sequence_p) {
1911             for (i=0; i < effective; ++i) {
1912                float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1913                if (outputs[c_inter])
1914                   outputs[c_inter][p_inter] += val;
1915                if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1916                last = val;
1917             }
1918          } else {
1919             for (i=0; i < effective; ++i) {
1920                float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1921                if (outputs[c_inter])
1922                   outputs[c_inter][p_inter] += val;
1923                if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1924             }
1925          }
1926       }
1927 
1928       total_decode -= effective;
1929    }
1930    *c_inter_p = c_inter;
1931    *p_inter_p = p_inter;
1932    return TRUE;
1933 }
1934 
1935 static int predict_point(int x, int x0, int x1, int y0, int y1)
1936 {
1937    int dy = y1 - y0;
1938    int adx = x1 - x0;
1939    // @OPTIMIZE: force int division to round in the right direction... is this necessary on x86?
1940    int err = abs(dy) * (x - x0);
1941    int off = err / adx;
1942    return dy < 0 ? y0 - off : y0 + off;
1943 }
1944 
1945 // the following table is block-copied from the specification
1946 static float inverse_db_table[256] =
1947 {
1948   1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f,
1949   1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f,
1950   1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f,
1951   2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f,
1952   2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f,
1953   3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f,
1954   4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f,
1955   6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f,
1956   7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f,
1957   1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f,
1958   1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f,
1959   1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f,
1960   2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f,
1961   2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f,
1962   3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f,
1963   4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f,
1964   5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f,
1965   7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f,
1966   9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f,
1967   1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f,
1968   1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f,
1969   2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f,
1970   2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f,
1971   3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f,
1972   4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f,
1973   5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f,
1974   7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f,
1975   9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f,
1976   0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f,
1977   0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f,
1978   0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f,
1979   0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f,
1980   0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f,
1981   0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f,
1982   0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f,
1983   0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f,
1984   0.00092223983f, 0.00098217216f, 0.0010459992f,  0.0011139742f,
1985   0.0011863665f,  0.0012634633f,  0.0013455702f,  0.0014330129f,
1986   0.0015261382f,  0.0016253153f,  0.0017309374f,  0.0018434235f,
1987   0.0019632195f,  0.0020908006f,  0.0022266726f,  0.0023713743f,
1988   0.0025254795f,  0.0026895994f,  0.0028643847f,  0.0030505286f,
1989   0.0032487691f,  0.0034598925f,  0.0036847358f,  0.0039241906f,
1990   0.0041792066f,  0.0044507950f,  0.0047400328f,  0.0050480668f,
1991   0.0053761186f,  0.0057254891f,  0.0060975636f,  0.0064938176f,
1992   0.0069158225f,  0.0073652516f,  0.0078438871f,  0.0083536271f,
1993   0.0088964928f,  0.009474637f,   0.010090352f,   0.010746080f,
1994   0.011444421f,   0.012188144f,   0.012980198f,   0.013823725f,
1995   0.014722068f,   0.015678791f,   0.016697687f,   0.017782797f,
1996   0.018938423f,   0.020169149f,   0.021479854f,   0.022875735f,
1997   0.024362330f,   0.025945531f,   0.027631618f,   0.029427276f,
1998   0.031339626f,   0.033376252f,   0.035545228f,   0.037855157f,
1999   0.040315199f,   0.042935108f,   0.045725273f,   0.048696758f,
2000   0.051861348f,   0.055231591f,   0.058820850f,   0.062643361f,
2001   0.066714279f,   0.071049749f,   0.075666962f,   0.080584227f,
2002   0.085821044f,   0.091398179f,   0.097337747f,   0.10366330f,
2003   0.11039993f,    0.11757434f,    0.12521498f,    0.13335215f,
2004   0.14201813f,    0.15124727f,    0.16107617f,    0.17154380f,
2005   0.18269168f,    0.19456402f,    0.20720788f,    0.22067342f,
2006   0.23501402f,    0.25028656f,    0.26655159f,    0.28387361f,
2007   0.30232132f,    0.32196786f,    0.34289114f,    0.36517414f,
2008   0.38890521f,    0.41417847f,    0.44109412f,    0.46975890f,
2009   0.50028648f,    0.53279791f,    0.56742212f,    0.60429640f,
2010   0.64356699f,    0.68538959f,    0.72993007f,    0.77736504f,
2011   0.82788260f,    0.88168307f,    0.9389798f,     1.0f
2012 };
2013 
2014 
2015 // @OPTIMIZE: if you want to replace this bresenham line-drawing routine,
2016 // note that you must produce bit-identical output to decode correctly;
2017 // this specific sequence of operations is specified in the spec (it's
2018 // drawing integer-quantized frequency-space lines that the encoder
2019 // expects to be exactly the same)
2020 //     ... also, isn't the whole point of Bresenham's algorithm to NOT
2021 // have to divide in the setup? sigh.
2022 #ifndef STB_VORBIS_NO_DEFER_FLOOR
2023 #define LINE_OP(a,b)   a *= b
2024 #else
2025 #define LINE_OP(a,b)   a = b
2026 #endif
2027 
2028 #ifdef STB_VORBIS_DIVIDE_TABLE
2029 #define DIVTAB_NUMER   32
2030 #define DIVTAB_DENOM   64
2031 int8 integer_divide_table[DIVTAB_NUMER][DIVTAB_DENOM]; // 2KB
2032 #endif
2033 
2034 static __forceinline void draw_line(float *output, int x0, int y0, int x1, int y1, int n)
2035 {
2036    int dy = y1 - y0;
2037    int adx = x1 - x0;
2038    int ady = abs(dy);
2039    int base;
2040    int x=x0,y=y0;
2041    int err = 0;
2042    int sy;
2043 
2044 #ifdef STB_VORBIS_DIVIDE_TABLE
2045    if (adx < DIVTAB_DENOM && ady < DIVTAB_NUMER) {
2046       if (dy < 0) {
2047          base = -integer_divide_table[ady][adx];
2048          sy = base-1;
2049       } else {
2050          base =  integer_divide_table[ady][adx];
2051          sy = base+1;
2052       }
2053    } else {
2054       base = dy / adx;
2055       if (dy < 0)
2056          sy = base - 1;
2057       else
2058          sy = base+1;
2059    }
2060 #else
2061    base = dy / adx;
2062    if (dy < 0)
2063       sy = base - 1;
2064    else
2065       sy = base+1;
2066 #endif
2067    ady -= abs(base) * adx;
2068    if (x1 > n) x1 = n;
2069    if (x < x1) {
2070       LINE_OP(output[x], inverse_db_table[y&255]);
2071       for (++x; x < x1; ++x) {
2072          err += ady;
2073          if (err >= adx) {
2074             err -= adx;
2075             y += sy;
2076          } else
2077             y += base;
2078          LINE_OP(output[x], inverse_db_table[y&255]);
2079       }
2080    }
2081 }
2082 
2083 static int residue_decode(vorb *f, Codebook *book, float *target, int offset, int n, int rtype)
2084 {
2085    int k;
2086    if (rtype == 0) {
2087       int step = n / book->dimensions;
2088       for (k=0; k < step; ++k)
2089          if (!codebook_decode_step(f, book, target+offset+k, n-offset-k, step))
2090             return FALSE;
2091    } else {
2092       for (k=0; k < n; ) {
2093          if (!codebook_decode(f, book, target+offset, n-k))
2094             return FALSE;
2095          k += book->dimensions;
2096          offset += book->dimensions;
2097       }
2098    }
2099    return TRUE;
2100 }
2101 
2102 // n is 1/2 of the blocksize --
2103 // specification: "Correct per-vector decode length is [n]/2"
2104 static void decode_residue(vorb *f, float *residue_buffers[], int ch, int n, int rn, uint8 *do_not_decode)
2105 {
2106    int i,j,pass;
2107    Residue *r = f->residue_config + rn;
2108    int rtype = f->residue_types[rn];
2109    int c = r->classbook;
2110    int classwords = f->codebooks[c].dimensions;
2111    unsigned int actual_size = rtype == 2 ? n*2 : n;
2112    unsigned int limit_r_begin = (r->begin < actual_size ? r->begin : actual_size);
2113    unsigned int limit_r_end   = (r->end   < actual_size ? r->end   : actual_size);
2114    int n_read = limit_r_end - limit_r_begin;
2115    int part_read = n_read / r->part_size;
2116    int temp_alloc_point = temp_alloc_save(f);
2117    #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2118    uint8 ***part_classdata = (uint8 ***) temp_block_array(f,f->channels, part_read * sizeof(**part_classdata));
2119    #else
2120    int **classifications = (int **) temp_block_array(f,f->channels, part_read * sizeof(**classifications));
2121    #endif
2122 
2123    CHECK(f);
2124 
2125    for (i=0; i < ch; ++i)
2126       if (!do_not_decode[i])
2127          memset(residue_buffers[i], 0, sizeof(float) * n);
2128 
2129    if (rtype == 2 && ch != 1) {
2130       for (j=0; j < ch; ++j)
2131          if (!do_not_decode[j])
2132             break;
2133       if (j == ch)
2134          goto done;
2135 
2136       for (pass=0; pass < 8; ++pass) {
2137          int pcount = 0, class_set = 0;
2138          if (ch == 2) {
2139             while (pcount < part_read) {
2140                int z = r->begin + pcount*r->part_size;
2141                int c_inter = (z & 1), p_inter = z>>1;
2142                if (pass == 0) {
2143                   Codebook *c = f->codebooks+r->classbook;
2144                   int q;
2145                   DECODE(q,f,c);
2146                   if (q == EOP) goto done;
2147                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2148                   part_classdata[0][class_set] = r->classdata[q];
2149                   #else
2150                   for (i=classwords-1; i >= 0; --i) {
2151                      classifications[0][i+pcount] = q % r->classifications;
2152                      q /= r->classifications;
2153                   }
2154                   #endif
2155                }
2156                for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2157                   int z = r->begin + pcount*r->part_size;
2158                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2159                   int c = part_classdata[0][class_set][i];
2160                   #else
2161                   int c = classifications[0][pcount];
2162                   #endif
2163                   int b = r->residue_books[c][pass];
2164                   if (b >= 0) {
2165                      Codebook *book = f->codebooks + b;
2166                      #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
2167                      if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2168                         goto done;
2169                      #else
2170                      // saves 1%
2171                      if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2172                         goto done;
2173                      #endif
2174                   } else {
2175                      z += r->part_size;
2176                      c_inter = z & 1;
2177                      p_inter = z >> 1;
2178                   }
2179                }
2180                #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2181                ++class_set;
2182                #endif
2183             }
2184          } else if (ch > 2) {
2185             while (pcount < part_read) {
2186                int z = r->begin + pcount*r->part_size;
2187                int c_inter = z % ch, p_inter = z/ch;
2188                if (pass == 0) {
2189                   Codebook *c = f->codebooks+r->classbook;
2190                   int q;
2191                   DECODE(q,f,c);
2192                   if (q == EOP) goto done;
2193                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2194                   part_classdata[0][class_set] = r->classdata[q];
2195                   #else
2196                   for (i=classwords-1; i >= 0; --i) {
2197                      classifications[0][i+pcount] = q % r->classifications;
2198                      q /= r->classifications;
2199                   }
2200                   #endif
2201                }
2202                for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2203                   int z = r->begin + pcount*r->part_size;
2204                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2205                   int c = part_classdata[0][class_set][i];
2206                   #else
2207                   int c = classifications[0][pcount];
2208                   #endif
2209                   int b = r->residue_books[c][pass];
2210                   if (b >= 0) {
2211                      Codebook *book = f->codebooks + b;
2212                      if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2213                         goto done;
2214                   } else {
2215                      z += r->part_size;
2216                      c_inter = z % ch;
2217                      p_inter = z / ch;
2218                   }
2219                }
2220                #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2221                ++class_set;
2222                #endif
2223             }
2224          }
2225       }
2226       goto done;
2227    }
2228    CHECK(f);
2229 
2230    for (pass=0; pass < 8; ++pass) {
2231       int pcount = 0, class_set=0;
2232       while (pcount < part_read) {
2233          if (pass == 0) {
2234             for (j=0; j < ch; ++j) {
2235                if (!do_not_decode[j]) {
2236                   Codebook *c = f->codebooks+r->classbook;
2237                   int temp;
2238                   DECODE(temp,f,c);
2239                   if (temp == EOP) goto done;
2240                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2241                   part_classdata[j][class_set] = r->classdata[temp];
2242                   #else
2243                   for (i=classwords-1; i >= 0; --i) {
2244                      classifications[j][i+pcount] = temp % r->classifications;
2245                      temp /= r->classifications;
2246                   }
2247                   #endif
2248                }
2249             }
2250          }
2251          for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2252             for (j=0; j < ch; ++j) {
2253                if (!do_not_decode[j]) {
2254                   #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2255                   int c = part_classdata[j][class_set][i];
2256                   #else
2257                   int c = classifications[j][pcount];
2258                   #endif
2259                   int b = r->residue_books[c][pass];
2260                   if (b >= 0) {
2261                      float *target = residue_buffers[j];
2262                      int offset = r->begin + pcount * r->part_size;
2263                      int n = r->part_size;
2264                      Codebook *book = f->codebooks + b;
2265                      if (!residue_decode(f, book, target, offset, n, rtype))
2266                         goto done;
2267                   }
2268                }
2269             }
2270          }
2271          #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2272          ++class_set;
2273          #endif
2274       }
2275    }
2276   done:
2277    CHECK(f);
2278    #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2279    temp_free(f,part_classdata);
2280    #else
2281    temp_free(f,classifications);
2282    #endif
2283    temp_alloc_restore(f,temp_alloc_point);
2284 }
2285 
2286 
2287 #if 0
2288 // slow way for debugging
2289 void inverse_mdct_slow(float *buffer, int n)
2290 {
2291    int i,j;
2292    int n2 = n >> 1;
2293    float *x = (float *) malloc(sizeof(*x) * n2);
2294    memcpy(x, buffer, sizeof(*x) * n2);
2295    for (i=0; i < n; ++i) {
2296       float acc = 0;
2297       for (j=0; j < n2; ++j)
2298          // formula from paper:
2299          //acc += n/4.0f * x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
2300          // formula from wikipedia
2301          //acc += 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
2302          // these are equivalent, except the formula from the paper inverts the multiplier!
2303          // however, what actually works is NO MULTIPLIER!?!
2304          //acc += 64 * 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
2305          acc += x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
2306       buffer[i] = acc;
2307    }
2308    free(x);
2309 }
2310 #elif 0
2311 // same as above, but just barely able to run in real time on modern machines
2312 void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
2313 {
2314    float mcos[16384];
2315    int i,j;
2316    int n2 = n >> 1, nmask = (n << 2) -1;
2317    float *x = (float *) malloc(sizeof(*x) * n2);
2318    memcpy(x, buffer, sizeof(*x) * n2);
2319    for (i=0; i < 4*n; ++i)
2320       mcos[i] = (float) cos(M_PI / 2 * i / n);
2321 
2322    for (i=0; i < n; ++i) {
2323       float acc = 0;
2324       for (j=0; j < n2; ++j)
2325          acc += x[j] * mcos[(2 * i + 1 + n2)*(2*j+1) & nmask];
2326       buffer[i] = acc;
2327    }
2328    free(x);
2329 }
2330 #elif 0
2331 // transform to use a slow dct-iv; this is STILL basically trivial,
2332 // but only requires half as many ops
2333 void dct_iv_slow(float *buffer, int n)
2334 {
2335    float mcos[16384];
2336    float x[2048];
2337    int i,j;
2338    int n2 = n >> 1, nmask = (n << 3) - 1;
2339    memcpy(x, buffer, sizeof(*x) * n);
2340    for (i=0; i < 8*n; ++i)
2341       mcos[i] = (float) cos(M_PI / 4 * i / n);
2342    for (i=0; i < n; ++i) {
2343       float acc = 0;
2344       for (j=0; j < n; ++j)
2345          acc += x[j] * mcos[((2 * i + 1)*(2*j+1)) & nmask];
2346       buffer[i] = acc;
2347    }
2348 }
2349 
2350 void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
2351 {
2352    int i, n4 = n >> 2, n2 = n >> 1, n3_4 = n - n4;
2353    float temp[4096];
2354 
2355    memcpy(temp, buffer, n2 * sizeof(float));
2356    dct_iv_slow(temp, n2);  // returns -c'-d, a-b'
2357 
2358    for (i=0; i < n4  ; ++i) buffer[i] = temp[i+n4];            // a-b'
2359    for (   ; i < n3_4; ++i) buffer[i] = -temp[n3_4 - i - 1];   // b-a', c+d'
2360    for (   ; i < n   ; ++i) buffer[i] = -temp[i - n3_4];       // c'+d
2361 }
2362 #endif
2363 
2364 #ifndef LIBVORBIS_MDCT
2365 #define LIBVORBIS_MDCT 0
2366 #endif
2367 
2368 #if LIBVORBIS_MDCT
2369 // directly call the vorbis MDCT using an interface documented
2370 // by Jeff Roberts... useful for performance comparison
2371 typedef struct
2372 {
2373   int n;
2374   int log2n;
2375 
2376   float *trig;
2377   int   *bitrev;
2378 
2379   float scale;
2380 } mdct_lookup;
2381 
2382 extern void mdct_init(mdct_lookup *lookup, int n);
2383 extern void mdct_clear(mdct_lookup *l);
2384 extern void mdct_backward(mdct_lookup *init, float *in, float *out);
2385 
2386 mdct_lookup M1,M2;
2387 
2388 void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
2389 {
2390    mdct_lookup *M;
2391    if (M1.n == n) M = &M1;
2392    else if (M2.n == n) M = &M2;
2393    else if (M1.n == 0) { mdct_init(&M1, n); M = &M1; }
2394    else {
2395       if (M2.n) __asm int 3;
2396       mdct_init(&M2, n);
2397       M = &M2;
2398    }
2399 
2400    mdct_backward(M, buffer, buffer);
2401 }
2402 #endif
2403 
2404 
2405 // the following were split out into separate functions while optimizing;
2406 // they could be pushed back up but eh. __forceinline showed no change;
2407 // they're probably already being inlined.
2408 static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A)
2409 {
2410    float *ee0 = e + i_off;
2411    float *ee2 = ee0 + k_off;
2412    int i;
2413 
2414    assert((n & 3) == 0);
2415    for (i=(n>>2); i > 0; --i) {
2416       float k00_20, k01_21;
2417       k00_20  = ee0[ 0] - ee2[ 0];
2418       k01_21  = ee0[-1] - ee2[-1];
2419       ee0[ 0] += ee2[ 0];//ee0[ 0] = ee0[ 0] + ee2[ 0];
2420       ee0[-1] += ee2[-1];//ee0[-1] = ee0[-1] + ee2[-1];
2421       ee2[ 0] = k00_20 * A[0] - k01_21 * A[1];
2422       ee2[-1] = k01_21 * A[0] + k00_20 * A[1];
2423       A += 8;
2424 
2425       k00_20  = ee0[-2] - ee2[-2];
2426       k01_21  = ee0[-3] - ee2[-3];
2427       ee0[-2] += ee2[-2];//ee0[-2] = ee0[-2] + ee2[-2];
2428       ee0[-3] += ee2[-3];//ee0[-3] = ee0[-3] + ee2[-3];
2429       ee2[-2] = k00_20 * A[0] - k01_21 * A[1];
2430       ee2[-3] = k01_21 * A[0] + k00_20 * A[1];
2431       A += 8;
2432 
2433       k00_20  = ee0[-4] - ee2[-4];
2434       k01_21  = ee0[-5] - ee2[-5];
2435       ee0[-4] += ee2[-4];//ee0[-4] = ee0[-4] + ee2[-4];
2436       ee0[-5] += ee2[-5];//ee0[-5] = ee0[-5] + ee2[-5];
2437       ee2[-4] = k00_20 * A[0] - k01_21 * A[1];
2438       ee2[-5] = k01_21 * A[0] + k00_20 * A[1];
2439       A += 8;
2440 
2441       k00_20  = ee0[-6] - ee2[-6];
2442       k01_21  = ee0[-7] - ee2[-7];
2443       ee0[-6] += ee2[-6];//ee0[-6] = ee0[-6] + ee2[-6];
2444       ee0[-7] += ee2[-7];//ee0[-7] = ee0[-7] + ee2[-7];
2445       ee2[-6] = k00_20 * A[0] - k01_21 * A[1];
2446       ee2[-7] = k01_21 * A[0] + k00_20 * A[1];
2447       A += 8;
2448       ee0 -= 8;
2449       ee2 -= 8;
2450    }
2451 }
2452 
2453 static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1)
2454 {
2455    int i;
2456    float k00_20, k01_21;
2457 
2458    float *e0 = e + d0;
2459    float *e2 = e0 + k_off;
2460 
2461    for (i=lim >> 2; i > 0; --i) {
2462       k00_20 = e0[-0] - e2[-0];
2463       k01_21 = e0[-1] - e2[-1];
2464       e0[-0] += e2[-0];//e0[-0] = e0[-0] + e2[-0];
2465       e0[-1] += e2[-1];//e0[-1] = e0[-1] + e2[-1];
2466       e2[-0] = (k00_20)*A[0] - (k01_21) * A[1];
2467       e2[-1] = (k01_21)*A[0] + (k00_20) * A[1];
2468 
2469       A += k1;
2470 
2471       k00_20 = e0[-2] - e2[-2];
2472       k01_21 = e0[-3] - e2[-3];
2473       e0[-2] += e2[-2];//e0[-2] = e0[-2] + e2[-2];
2474       e0[-3] += e2[-3];//e0[-3] = e0[-3] + e2[-3];
2475       e2[-2] = (k00_20)*A[0] - (k01_21) * A[1];
2476       e2[-3] = (k01_21)*A[0] + (k00_20) * A[1];
2477 
2478       A += k1;
2479 
2480       k00_20 = e0[-4] - e2[-4];
2481       k01_21 = e0[-5] - e2[-5];
2482       e0[-4] += e2[-4];//e0[-4] = e0[-4] + e2[-4];
2483       e0[-5] += e2[-5];//e0[-5] = e0[-5] + e2[-5];
2484       e2[-4] = (k00_20)*A[0] - (k01_21) * A[1];
2485       e2[-5] = (k01_21)*A[0] + (k00_20) * A[1];
2486 
2487       A += k1;
2488 
2489       k00_20 = e0[-6] - e2[-6];
2490       k01_21 = e0[-7] - e2[-7];
2491       e0[-6] += e2[-6];//e0[-6] = e0[-6] + e2[-6];
2492       e0[-7] += e2[-7];//e0[-7] = e0[-7] + e2[-7];
2493       e2[-6] = (k00_20)*A[0] - (k01_21) * A[1];
2494       e2[-7] = (k01_21)*A[0] + (k00_20) * A[1];
2495 
2496       e0 -= 8;
2497       e2 -= 8;
2498 
2499       A += k1;
2500    }
2501 }
2502 
2503 static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0)
2504 {
2505    int i;
2506    float A0 = A[0];
2507    float A1 = A[0+1];
2508    float A2 = A[0+a_off];
2509    float A3 = A[0+a_off+1];
2510    float A4 = A[0+a_off*2+0];
2511    float A5 = A[0+a_off*2+1];
2512    float A6 = A[0+a_off*3+0];
2513    float A7 = A[0+a_off*3+1];
2514 
2515    float k00,k11;
2516 
2517    float *ee0 = e  +i_off;
2518    float *ee2 = ee0+k_off;
2519 
2520    for (i=n; i > 0; --i) {
2521       k00     = ee0[ 0] - ee2[ 0];
2522       k11     = ee0[-1] - ee2[-1];
2523       ee0[ 0] =  ee0[ 0] + ee2[ 0];
2524       ee0[-1] =  ee0[-1] + ee2[-1];
2525       ee2[ 0] = (k00) * A0 - (k11) * A1;
2526       ee2[-1] = (k11) * A0 + (k00) * A1;
2527 
2528       k00     = ee0[-2] - ee2[-2];
2529       k11     = ee0[-3] - ee2[-3];
2530       ee0[-2] =  ee0[-2] + ee2[-2];
2531       ee0[-3] =  ee0[-3] + ee2[-3];
2532       ee2[-2] = (k00) * A2 - (k11) * A3;
2533       ee2[-3] = (k11) * A2 + (k00) * A3;
2534 
2535       k00     = ee0[-4] - ee2[-4];
2536       k11     = ee0[-5] - ee2[-5];
2537       ee0[-4] =  ee0[-4] + ee2[-4];
2538       ee0[-5] =  ee0[-5] + ee2[-5];
2539       ee2[-4] = (k00) * A4 - (k11) * A5;
2540       ee2[-5] = (k11) * A4 + (k00) * A5;
2541 
2542       k00     = ee0[-6] - ee2[-6];
2543       k11     = ee0[-7] - ee2[-7];
2544       ee0[-6] =  ee0[-6] + ee2[-6];
2545       ee0[-7] =  ee0[-7] + ee2[-7];
2546       ee2[-6] = (k00) * A6 - (k11) * A7;
2547       ee2[-7] = (k11) * A6 + (k00) * A7;
2548 
2549       ee0 -= k0;
2550       ee2 -= k0;
2551    }
2552 }
2553 
2554 static __forceinline void iter_54(float *z)
2555 {
2556    float k00,k11,k22,k33;
2557    float y0,y1,y2,y3;
2558 
2559    k00  = z[ 0] - z[-4];
2560    y0   = z[ 0] + z[-4];
2561    y2   = z[-2] + z[-6];
2562    k22  = z[-2] - z[-6];
2563 
2564    z[-0] = y0 + y2;      // z0 + z4 + z2 + z6
2565    z[-2] = y0 - y2;      // z0 + z4 - z2 - z6
2566 
2567    // done with y0,y2
2568 
2569    k33  = z[-3] - z[-7];
2570 
2571    z[-4] = k00 + k33;    // z0 - z4 + z3 - z7
2572    z[-6] = k00 - k33;    // z0 - z4 - z3 + z7
2573 
2574    // done with k33
2575 
2576    k11  = z[-1] - z[-5];
2577    y1   = z[-1] + z[-5];
2578    y3   = z[-3] + z[-7];
2579 
2580    z[-1] = y1 + y3;      // z1 + z5 + z3 + z7
2581    z[-3] = y1 - y3;      // z1 + z5 - z3 - z7
2582    z[-5] = k11 - k22;    // z1 - z5 + z2 - z6
2583    z[-7] = k11 + k22;    // z1 - z5 - z2 + z6
2584 }
2585 
2586 static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A, int base_n)
2587 {
2588    int a_off = base_n >> 3;
2589    float A2 = A[0+a_off];
2590    float *z = e + i_off;
2591    float *base = z - 16 * n;
2592 
2593    while (z > base) {
2594       float k00,k11;
2595       float l00,l11;
2596 
2597       k00    = z[-0] - z[ -8];
2598       k11    = z[-1] - z[ -9];
2599       l00    = z[-2] - z[-10];
2600       l11    = z[-3] - z[-11];
2601       z[ -0] = z[-0] + z[ -8];
2602       z[ -1] = z[-1] + z[ -9];
2603       z[ -2] = z[-2] + z[-10];
2604       z[ -3] = z[-3] + z[-11];
2605       z[ -8] = k00;
2606       z[ -9] = k11;
2607       z[-10] = (l00+l11) * A2;
2608       z[-11] = (l11-l00) * A2;
2609 
2610       k00    = z[ -4] - z[-12];
2611       k11    = z[ -5] - z[-13];
2612       l00    = z[ -6] - z[-14];
2613       l11    = z[ -7] - z[-15];
2614       z[ -4] = z[ -4] + z[-12];
2615       z[ -5] = z[ -5] + z[-13];
2616       z[ -6] = z[ -6] + z[-14];
2617       z[ -7] = z[ -7] + z[-15];
2618       z[-12] = k11;
2619       z[-13] = -k00;
2620       z[-14] = (l11-l00) * A2;
2621       z[-15] = (l00+l11) * -A2;
2622 
2623       iter_54(z);
2624       iter_54(z-8);
2625       z -= 16;
2626    }
2627 }
2628 
2629 static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
2630 {
2631    int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
2632    int ld;
2633    // @OPTIMIZE: reduce register pressure by using fewer variables?
2634    int save_point = temp_alloc_save(f);
2635    float *buf2 = (float *) temp_alloc(f, n2 * sizeof(*buf2));
2636    float *u=NULL,*v=NULL;
2637    // twiddle factors
2638    float *A = f->A[blocktype];
2639 
2640    // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
2641    // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' after this function.
2642 
2643    // kernel from paper
2644 
2645 
2646    // merged:
2647    //   copy and reflect spectral data
2648    //   step 0
2649 
2650    // note that it turns out that the items added together during
2651    // this step are, in fact, being added to themselves (as reflected
2652    // by step 0). inexplicable inefficiency! this became obvious
2653    // once I combined the passes.
2654 
2655    // so there's a missing 'times 2' here (for adding X to itself).
2656    // this propagates through linearly to the end, where the numbers
2657    // are 1/2 too small, and need to be compensated for.
2658 
2659    {
2660       float *d,*e, *AA, *e_stop;
2661       d = &buf2[n2-2];
2662       AA = A;
2663       e = &buffer[0];
2664       e_stop = &buffer[n2];
2665       while (e != e_stop) {
2666          d[1] = (e[0] * AA[0] - e[2]*AA[1]);
2667          d[0] = (e[0] * AA[1] + e[2]*AA[0]);
2668          d -= 2;
2669          AA += 2;
2670          e += 4;
2671       }
2672 
2673       e = &buffer[n2-3];
2674       while (d >= buf2) {
2675          d[1] = (-e[2] * AA[0] - -e[0]*AA[1]);
2676          d[0] = (-e[2] * AA[1] + -e[0]*AA[0]);
2677          d -= 2;
2678          AA += 2;
2679          e -= 4;
2680       }
2681    }
2682 
2683    // now we use symbolic names for these, so that we can
2684    // possibly swap their meaning as we change which operations
2685    // are in place
2686 
2687    u = buffer;
2688    v = buf2;
2689 
2690    // step 2    (paper output is w, now u)
2691    // this could be in place, but the data ends up in the wrong
2692    // place... _somebody_'s got to swap it, so this is nominated
2693    {
2694       float *AA = &A[n2-8];
2695       float *d0,*d1, *e0, *e1;
2696 
2697       e0 = &v[n4];
2698       e1 = &v[0];
2699 
2700       d0 = &u[n4];
2701       d1 = &u[0];
2702 
2703       while (AA >= A) {
2704          float v40_20, v41_21;
2705 
2706          v41_21 = e0[1] - e1[1];
2707          v40_20 = e0[0] - e1[0];
2708          d0[1]  = e0[1] + e1[1];
2709          d0[0]  = e0[0] + e1[0];
2710          d1[1]  = v41_21*AA[4] - v40_20*AA[5];
2711          d1[0]  = v40_20*AA[4] + v41_21*AA[5];
2712 
2713          v41_21 = e0[3] - e1[3];
2714          v40_20 = e0[2] - e1[2];
2715          d0[3]  = e0[3] + e1[3];
2716          d0[2]  = e0[2] + e1[2];
2717          d1[3]  = v41_21*AA[0] - v40_20*AA[1];
2718          d1[2]  = v40_20*AA[0] + v41_21*AA[1];
2719 
2720          AA -= 8;
2721 
2722          d0 += 4;
2723          d1 += 4;
2724          e0 += 4;
2725          e1 += 4;
2726       }
2727    }
2728 
2729    // step 3
2730    ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
2731 
2732    // optimized step 3:
2733 
2734    // the original step3 loop can be nested r inside s or s inside r;
2735    // it's written originally as s inside r, but this is dumb when r
2736    // iterates many times, and s few. So I have two copies of it and
2737    // switch between them halfway.
2738 
2739    // this is iteration 0 of step 3
2740    imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A);
2741    imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A);
2742 
2743    // this is iteration 1 of step 3
2744    imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16);
2745    imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16);
2746    imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16);
2747    imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16);
2748 
2749    l=2;
2750    for (; l < (ld-3)>>1; ++l) {
2751       int k0 = n >> (l+2), k0_2 = k0>>1;
2752       int lim = 1 << (l+1);
2753       int i;
2754       for (i=0; i < lim; ++i)
2755          imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
2756    }
2757 
2758    for (; l < ld-6; ++l) {
2759       int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
2760       int rlim = n >> (l+6), r;
2761       int lim = 1 << (l+1);
2762       int i_off;
2763       float *A0 = A;
2764       i_off = n2-1;
2765       for (r=rlim; r > 0; --r) {
2766          imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, A0, k1, k0);
2767          A0 += k1*4;
2768          i_off -= 8;
2769       }
2770    }
2771 
2772    // iterations with count:
2773    //   ld-6,-5,-4 all interleaved together
2774    //       the big win comes from getting rid of needless flops
2775    //         due to the constants on pass 5 & 4 being all 1 and 0;
2776    //       combining them to be simultaneous to improve cache made little difference
2777    imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n);
2778 
2779    // output is u
2780 
2781    // step 4, 5, and 6
2782    // cannot be in-place because of step 5
2783    {
2784       uint16 *bitrev = f->bit_reverse[blocktype];
2785       // weirdly, I'd have thought reading sequentially and writing
2786       // erratically would have been better than vice-versa, but in
2787       // fact that's not what my testing showed. (That is, with
2788       // j = bitreverse(i), do you read i and write j, or read j and write i.)
2789 
2790       float *d0 = &v[n4-4];
2791       float *d1 = &v[n2-4];
2792       while (d0 >= v) {
2793          int k4;
2794 
2795          k4 = bitrev[0];
2796          d1[3] = u[k4+0];
2797          d1[2] = u[k4+1];
2798          d0[3] = u[k4+2];
2799          d0[2] = u[k4+3];
2800 
2801          k4 = bitrev[1];
2802          d1[1] = u[k4+0];
2803          d1[0] = u[k4+1];
2804          d0[1] = u[k4+2];
2805          d0[0] = u[k4+3];
2806 
2807          d0 -= 4;
2808          d1 -= 4;
2809          bitrev += 2;
2810       }
2811    }
2812    // (paper output is u, now v)
2813 
2814 
2815    // data must be in buf2
2816    assert(v == buf2);
2817 
2818    // step 7   (paper output is v, now v)
2819    // this is now in place
2820    {
2821       float *C = f->C[blocktype];
2822       float *d, *e;
2823 
2824       d = v;
2825       e = v + n2 - 4;
2826 
2827       while (d < e) {
2828          float a02,a11,b0,b1,b2,b3;
2829 
2830          a02 = d[0] - e[2];
2831          a11 = d[1] + e[3];
2832 
2833          b0 = C[1]*a02 + C[0]*a11;
2834          b1 = C[1]*a11 - C[0]*a02;
2835 
2836          b2 = d[0] + e[ 2];
2837          b3 = d[1] - e[ 3];
2838 
2839          d[0] = b2 + b0;
2840          d[1] = b3 + b1;
2841          e[2] = b2 - b0;
2842          e[3] = b1 - b3;
2843 
2844          a02 = d[2] - e[0];
2845          a11 = d[3] + e[1];
2846 
2847          b0 = C[3]*a02 + C[2]*a11;
2848          b1 = C[3]*a11 - C[2]*a02;
2849 
2850          b2 = d[2] + e[ 0];
2851          b3 = d[3] - e[ 1];
2852 
2853          d[2] = b2 + b0;
2854          d[3] = b3 + b1;
2855          e[0] = b2 - b0;
2856          e[1] = b1 - b3;
2857 
2858          C += 4;
2859          d += 4;
2860          e -= 4;
2861       }
2862    }
2863 
2864    // data must be in buf2
2865 
2866 
2867    // step 8+decode   (paper output is X, now buffer)
2868    // this generates pairs of data a la 8 and pushes them directly through
2869    // the decode kernel (pushing rather than pulling) to avoid having
2870    // to make another pass later
2871 
2872    // this cannot POSSIBLY be in place, so we refer to the buffers directly
2873 
2874    {
2875       float *d0,*d1,*d2,*d3;
2876 
2877       float *B = f->B[blocktype] + n2 - 8;
2878       float *e = buf2 + n2 - 8;
2879       d0 = &buffer[0];
2880       d1 = &buffer[n2-4];
2881       d2 = &buffer[n2];
2882       d3 = &buffer[n-4];
2883       while (e >= v) {
2884          float p0,p1,p2,p3;
2885 
2886          p3 =  e[6]*B[7] - e[7]*B[6];
2887          p2 = -e[6]*B[6] - e[7]*B[7];
2888 
2889          d0[0] =   p3;
2890          d1[3] = - p3;
2891          d2[0] =   p2;
2892          d3[3] =   p2;
2893 
2894          p1 =  e[4]*B[5] - e[5]*B[4];
2895          p0 = -e[4]*B[4] - e[5]*B[5];
2896 
2897          d0[1] =   p1;
2898          d1[2] = - p1;
2899          d2[1] =   p0;
2900          d3[2] =   p0;
2901 
2902          p3 =  e[2]*B[3] - e[3]*B[2];
2903          p2 = -e[2]*B[2] - e[3]*B[3];
2904 
2905          d0[2] =   p3;
2906          d1[1] = - p3;
2907          d2[2] =   p2;
2908          d3[1] =   p2;
2909 
2910          p1 =  e[0]*B[1] - e[1]*B[0];
2911          p0 = -e[0]*B[0] - e[1]*B[1];
2912 
2913          d0[3] =   p1;
2914          d1[0] = - p1;
2915          d2[3] =   p0;
2916          d3[0] =   p0;
2917 
2918          B -= 8;
2919          e -= 8;
2920          d0 += 4;
2921          d2 += 4;
2922          d1 -= 4;
2923          d3 -= 4;
2924       }
2925    }
2926 
2927    temp_free(f,buf2);
2928    temp_alloc_restore(f,save_point);
2929 }
2930 
2931 #if 0
2932 // this is the original version of the above code, if you want to optimize it from scratch
2933 void inverse_mdct_naive(float *buffer, int n)
2934 {
2935    float s;
2936    float A[1 << 12], B[1 << 12], C[1 << 11];
2937    int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
2938    int n3_4 = n - n4, ld;
2939    // how can they claim this only uses N words?!
2940    // oh, because they're only used sparsely, whoops
2941    float u[1 << 13], X[1 << 13], v[1 << 13], w[1 << 13];
2942    // set up twiddle factors
2943 
2944    for (k=k2=0; k < n4; ++k,k2+=2) {
2945       A[k2  ] = (float)  cos(4*k*M_PI/n);
2946       A[k2+1] = (float) -sin(4*k*M_PI/n);
2947       B[k2  ] = (float)  cos((k2+1)*M_PI/n/2);
2948       B[k2+1] = (float)  sin((k2+1)*M_PI/n/2);
2949    }
2950    for (k=k2=0; k < n8; ++k,k2+=2) {
2951       C[k2  ] = (float)  cos(2*(k2+1)*M_PI/n);
2952       C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
2953    }
2954 
2955    // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
2956    // Note there are bugs in that pseudocode, presumably due to them attempting
2957    // to rename the arrays nicely rather than representing the way their actual
2958    // implementation bounces buffers back and forth. As a result, even in the
2959    // "some formulars corrected" version, a direct implementation fails. These
2960    // are noted below as "paper bug".
2961 
2962    // copy and reflect spectral data
2963    for (k=0; k < n2; ++k) u[k] = buffer[k];
2964    for (   ; k < n ; ++k) u[k] = -buffer[n - k - 1];
2965    // kernel from paper
2966    // step 1
2967    for (k=k2=k4=0; k < n4; k+=1, k2+=2, k4+=4) {
2968       v[n-k4-1] = (u[k4] - u[n-k4-1]) * A[k2]   - (u[k4+2] - u[n-k4-3])*A[k2+1];
2969       v[n-k4-3] = (u[k4] - u[n-k4-1]) * A[k2+1] + (u[k4+2] - u[n-k4-3])*A[k2];
2970    }
2971    // step 2
2972    for (k=k4=0; k < n8; k+=1, k4+=4) {
2973       w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
2974       w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
2975       w[k4+3]    = (v[n2+3+k4] - v[k4+3])*A[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*A[n2-3-k4];
2976       w[k4+1]    = (v[n2+1+k4] - v[k4+1])*A[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*A[n2-3-k4];
2977    }
2978    // step 3
2979    ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
2980    for (l=0; l < ld-3; ++l) {
2981       int k0 = n >> (l+2), k1 = 1 << (l+3);
2982       int rlim = n >> (l+4), r4, r;
2983       int s2lim = 1 << (l+2), s2;
2984       for (r=r4=0; r < rlim; r4+=4,++r) {
2985          for (s2=0; s2 < s2lim; s2+=2) {
2986             u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
2987             u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
2988             u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1]
2989                                 - (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1+1];
2990             u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1]
2991                                 + (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1+1];
2992          }
2993       }
2994       if (l+1 < ld-3) {
2995          // paper bug: ping-ponging of u&w here is omitted
2996          memcpy(w, u, sizeof(u));
2997       }
2998    }
2999 
3000    // step 4
3001    for (i=0; i < n8; ++i) {
3002       int j = bit_reverse(i) >> (32-ld+3);
3003       assert(j < n8);
3004       if (i == j) {
3005          // paper bug: original code probably swapped in place; if copying,
3006          //            need to directly copy in this case
3007          int i8 = i << 3;
3008          v[i8+1] = u[i8+1];
3009          v[i8+3] = u[i8+3];
3010          v[i8+5] = u[i8+5];
3011          v[i8+7] = u[i8+7];
3012       } else if (i < j) {
3013          int i8 = i << 3, j8 = j << 3;
3014          v[j8+1] = u[i8+1], v[i8+1] = u[j8 + 1];
3015          v[j8+3] = u[i8+3], v[i8+3] = u[j8 + 3];
3016          v[j8+5] = u[i8+5], v[i8+5] = u[j8 + 5];
3017          v[j8+7] = u[i8+7], v[i8+7] = u[j8 + 7];
3018       }
3019    }
3020    // step 5
3021    for (k=0; k < n2; ++k) {
3022       w[k] = v[k*2+1];
3023    }
3024    // step 6
3025    for (k=k2=k4=0; k < n8; ++k, k2 += 2, k4 += 4) {
3026       u[n-1-k2] = w[k4];
3027       u[n-2-k2] = w[k4+1];
3028       u[n3_4 - 1 - k2] = w[k4+2];
3029       u[n3_4 - 2 - k2] = w[k4+3];
3030    }
3031    // step 7
3032    for (k=k2=0; k < n8; ++k, k2 += 2) {
3033       v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + C[k2+1]*(u[n2+k2]-u[n-2-k2]) + C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
3034       v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - C[k2+1]*(u[n2+k2]-u[n-2-k2]) - C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
3035       v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
3036       v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
3037    }
3038    // step 8
3039    for (k=k2=0; k < n4; ++k,k2 += 2) {
3040       X[k]      = v[k2+n2]*B[k2  ] + v[k2+1+n2]*B[k2+1];
3041       X[n2-1-k] = v[k2+n2]*B[k2+1] - v[k2+1+n2]*B[k2  ];
3042    }
3043 
3044    // decode kernel to output
3045    // determined the following value experimentally
3046    // (by first figuring out what made inverse_mdct_slow work); then matching that here
3047    // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?)
3048    s = 0.5; // theoretically would be n4
3049 
3050    // [[[ note! the s value of 0.5 is compensated for by the B[] in the current code,
3051    //     so it needs to use the "old" B values to behave correctly, or else
3052    //     set s to 1.0 ]]]
3053    for (i=0; i < n4  ; ++i) buffer[i] = s * X[i+n4];
3054    for (   ; i < n3_4; ++i) buffer[i] = -s * X[n3_4 - i - 1];
3055    for (   ; i < n   ; ++i) buffer[i] = -s * X[i - n3_4];
3056 }
3057 #endif
3058 
3059 static float *get_window(vorb *f, int len)
3060 {
3061    len <<= 1;
3062    if (len == f->blocksize_0) return f->window[0];
3063    if (len == f->blocksize_1) return f->window[1];
3064    return NULL;
3065 }
3066 
3067 #ifndef STB_VORBIS_NO_DEFER_FLOOR
3068 typedef int16 YTYPE;
3069 #else
3070 typedef int YTYPE;
3071 #endif
3072 static int do_floor(vorb *f, Mapping *map, int i, int n, float *target, YTYPE *finalY, uint8 *step2_flag)
3073 {
3074    int n2 = n >> 1;
3075    int s = map->chan[i].mux, floor;
3076    floor = map->submap_floor[s];
3077    if (f->floor_types[floor] == 0) {
3078       return error(f, VORBIS_invalid_stream);
3079    } else {
3080       Floor1 *g = &f->floor_config[floor].floor1;
3081       int j,q;
3082       int lx = 0, ly = finalY[0] * g->floor1_multiplier;
3083       for (q=1; q < g->values; ++q) {
3084          j = g->sorted_order[q];
3085          #ifndef STB_VORBIS_NO_DEFER_FLOOR
3086          STBV_NOTUSED(step2_flag);
3087          if (finalY[j] >= 0)
3088          #else
3089          if (step2_flag[j])
3090          #endif
3091          {
3092             int hy = finalY[j] * g->floor1_multiplier;
3093             int hx = g->Xlist[j];
3094             if (lx != hx)
3095                draw_line(target, lx,ly, hx,hy, n2);
3096             CHECK(f);
3097             lx = hx, ly = hy;
3098          }
3099       }
3100       if (lx < n2) {
3101          // optimization of: draw_line(target, lx,ly, n,ly, n2);
3102          for (j=lx; j < n2; ++j)
3103             LINE_OP(target[j], inverse_db_table[ly]);
3104          CHECK(f);
3105       }
3106    }
3107    return TRUE;
3108 }
3109 
3110 // The meaning of "left" and "right"
3111 //
3112 // For a given frame:
3113 //     we compute samples from 0..n
3114 //     window_center is n/2
3115 //     we'll window and mix the samples from left_start to left_end with data from the previous frame
3116 //     all of the samples from left_end to right_start can be output without mixing; however,
3117 //        this interval is 0-length except when transitioning between short and long frames
3118 //     all of the samples from right_start to right_end need to be mixed with the next frame,
3119 //        which we don't have, so those get saved in a buffer
3120 //     frame N's right_end-right_start, the number of samples to mix with the next frame,
3121 //        has to be the same as frame N+1's left_end-left_start (which they are by
3122 //        construction)
3123 
3124 static int vorbis_decode_initial(vorb *f, int *p_left_start, int *p_left_end, int *p_right_start, int *p_right_end, int *mode)
3125 {
3126    Mode *m;
3127    int i, n, prev, next, window_center;
3128    f->channel_buffer_start = f->channel_buffer_end = 0;
3129 
3130   retry:
3131    if (f->eof) return FALSE;
3132    if (!maybe_start_packet(f))
3133       return FALSE;
3134    // check packet type
3135    if (get_bits(f,1) != 0) {
3136       if (IS_PUSH_MODE(f))
3137          return error(f,VORBIS_bad_packet_type);
3138       while (EOP != get8_packet(f));
3139       goto retry;
3140    }
3141 
3142    if (f->alloc.alloc_buffer)
3143       assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
3144 
3145    i = get_bits(f, ilog(f->mode_count-1));
3146    if (i == EOP) return FALSE;
3147    if (i >= f->mode_count) return FALSE;
3148    *mode = i;
3149    m = f->mode_config + i;
3150    if (m->blockflag) {
3151       n = f->blocksize_1;
3152       prev = get_bits(f,1);
3153       next = get_bits(f,1);
3154    } else {
3155       prev = next = 0;
3156       n = f->blocksize_0;
3157    }
3158 
3159 // WINDOWING
3160 
3161    window_center = n >> 1;
3162    if (m->blockflag && !prev) {
3163       *p_left_start = (n - f->blocksize_0) >> 2;
3164       *p_left_end   = (n + f->blocksize_0) >> 2;
3165    } else {
3166       *p_left_start = 0;
3167       *p_left_end   = window_center;
3168    }
3169    if (m->blockflag && !next) {
3170       *p_right_start = (n*3 - f->blocksize_0) >> 2;
3171       *p_right_end   = (n*3 + f->blocksize_0) >> 2;
3172    } else {
3173       *p_right_start = window_center;
3174       *p_right_end   = n;
3175    }
3176 
3177    return TRUE;
3178 }
3179 
3180 static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start, int left_end, int right_start, int right_end, int *p_left)
3181 {
3182    Mapping *map;
3183    int i,j,k,n,n2;
3184    int zero_channel[256];
3185    int really_zero_channel[256];
3186 
3187 // WINDOWING
3188 
3189    STBV_NOTUSED(left_end);
3190    n = f->blocksize[m->blockflag];
3191    map = &f->mapping[m->mapping];
3192 
3193 // FLOORS
3194    n2 = n >> 1;
3195 
3196    CHECK(f);
3197 
3198    for (i=0; i < f->channels; ++i) {
3199       int s = map->chan[i].mux, floor;
3200       zero_channel[i] = FALSE;
3201       floor = map->submap_floor[s];
3202       if (f->floor_types[floor] == 0) {
3203          return error(f, VORBIS_invalid_stream);
3204       } else {
3205          Floor1 *g = &f->floor_config[floor].floor1;
3206          if (get_bits(f, 1)) {
3207             short *finalY;
3208             uint8 step2_flag[256];
3209             static int range_list[4] = { 256, 128, 86, 64 };
3210             int range = range_list[g->floor1_multiplier-1];
3211             int offset = 2;
3212             finalY = f->finalY[i];
3213             finalY[0] = get_bits(f, ilog(range)-1);
3214             finalY[1] = get_bits(f, ilog(range)-1);
3215             for (j=0; j < g->partitions; ++j) {
3216                int pclass = g->partition_class_list[j];
3217                int cdim = g->class_dimensions[pclass];
3218                int cbits = g->class_subclasses[pclass];
3219                int csub = (1 << cbits)-1;
3220                int cval = 0;
3221                if (cbits) {
3222                   Codebook *c = f->codebooks + g->class_masterbooks[pclass];
3223                   DECODE(cval,f,c);
3224                }
3225                for (k=0; k < cdim; ++k) {
3226                   int book = g->subclass_books[pclass][cval & csub];
3227                   cval = cval >> cbits;
3228                   if (book >= 0) {
3229                      int temp;
3230                      Codebook *c = f->codebooks + book;
3231                      DECODE(temp,f,c);
3232                      finalY[offset++] = temp;
3233                   } else
3234                      finalY[offset++] = 0;
3235                }
3236             }
3237             if (f->valid_bits == INVALID_BITS) goto error; // behavior according to spec
3238             step2_flag[0] = step2_flag[1] = 1;
3239             for (j=2; j < g->values; ++j) {
3240                int low, high, pred, highroom, lowroom, room, val;
3241                low = g->neighbors[j][0];
3242                high = g->neighbors[j][1];
3243                //neighbors(g->Xlist, j, &low, &high);
3244                pred = predict_point(g->Xlist[j], g->Xlist[low], g->Xlist[high], finalY[low], finalY[high]);
3245                val = finalY[j];
3246                highroom = range - pred;
3247                lowroom = pred;
3248                if (highroom < lowroom)
3249                   room = highroom * 2;
3250                else
3251                   room = lowroom * 2;
3252                if (val) {
3253                   step2_flag[low] = step2_flag[high] = 1;
3254                   step2_flag[j] = 1;
3255                   if (val >= room)
3256                      if (highroom > lowroom)
3257                         finalY[j] = val - lowroom + pred;
3258                      else
3259                         finalY[j] = pred - val + highroom - 1;
3260                   else
3261                      if (val & 1)
3262                         finalY[j] = pred - ((val+1)>>1);
3263                      else
3264                         finalY[j] = pred + (val>>1);
3265                } else {
3266                   step2_flag[j] = 0;
3267                   finalY[j] = pred;
3268                }
3269             }
3270 
3271 #ifdef STB_VORBIS_NO_DEFER_FLOOR
3272             do_floor(f, map, i, n, f->floor_buffers[i], finalY, step2_flag);
3273 #else
3274             // defer final floor computation until _after_ residue
3275             for (j=0; j < g->values; ++j) {
3276                if (!step2_flag[j])
3277                   finalY[j] = -1;
3278             }
3279 #endif
3280          } else {
3281            error:
3282             zero_channel[i] = TRUE;