v / thirdparty / stb_vorbis / stb_vorbis.c
5584 lines · 4959 sloc · 188.27 KB · 8287d5dd97821042e05ac55803af906c9276825c
Raw
1// Ogg Vorbis audio decoder - v1.22 - public domain
2// http://nothings.org/stb_vorbis/
3//
4// Original version written by Sean Barrett in 2007.
5//
6// Originally sponsored by RAD Game Tools. Seeking implementation
7// sponsored by Phillip Bennefall, Marc Andersen, Aaron Baker,
8// Elias Software, Aras Pranckevicius, and Sean Barrett.
9//
10// LICENSE
11//
12// See end of file for license information.
13//
14// Limitations:
15//
16// - floor 0 not supported (used in old ogg vorbis files pre-2004)
17// - lossless sample-truncation at beginning ignored
18// - cannot concatenate multiple vorbis streams
19// - sample positions are 32-bit, limiting seekable 192Khz
20// files to around 6 hours (Ogg supports 64-bit)
21//
22// Feature contributors:
23// Dougall Johnson (sample-exact seeking)
24//
25// Bugfix/warning contributors:
26// Terje Mathisen Niklas Frykholm Andy Hill
27// Casey Muratori John Bolton Gargaj
28// Laurent Gomila Marc LeBlanc Ronny Chevalier
29// Bernhard Wodo Evan Balster github:alxprd
30// Tom Beaumont Ingo Leitgeb Nicolas Guillemot
31// Phillip Bennefall Rohit Thiago Goulart
32// github:manxorist Saga Musix github:infatum
33// Timur Gagiev Maxwell Koo Peter Waller
34// github:audinowho Dougall Johnson David Reid
35// github:Clownacy Pedro J. Estebanez Remi Verschelde
36// AnthoFoxo github:morlat Gabriel Ravier
37//
38// Partial history:
39// 1.22 - 2021-07-11 - various small fixes
40// 1.21 - 2021-07-02 - fix bug for files with no comments
41// 1.20 - 2020-07-11 - several small fixes
42// 1.19 - 2020-02-05 - warnings
43// 1.18 - 2020-02-02 - fix seek bugs; parse header comments; misc warnings etc.
44// 1.17 - 2019-07-08 - fix CVE-2019-13217..CVE-2019-13223 (by ForAllSecure)
45// 1.16 - 2019-03-04 - fix warnings
46// 1.15 - 2019-02-07 - explicit failure if Ogg Skeleton data is found
47// 1.14 - 2018-02-11 - delete bogus dealloca usage
48// 1.13 - 2018-01-29 - fix truncation of last frame (hopefully)
49// 1.12 - 2017-11-21 - limit residue begin/end to blocksize/2 to avoid large temp allocs in bad/corrupt files
50// 1.11 - 2017-07-23 - fix MinGW compilation
51// 1.10 - 2017-03-03 - more robust seeking; fix negative ilog(); clear error in open_memory
52// 1.09 - 2016-04-04 - back out 'truncation of last frame' fix from previous version
53// 1.08 - 2016-04-02 - warnings; setup memory leaks; truncation of last frame
54// 1.07 - 2015-01-16 - fixes for crashes on invalid files; warning fixes; const
55// 1.06 - 2015-08-31 - full, correct support for seeking API (Dougall Johnson)
56// some crash fixes when out of memory or with corrupt files
57// fix some inappropriately signed shifts
58// 1.05 - 2015-04-19 - don't define __forceinline if it's redundant
59// 1.04 - 2014-08-27 - fix missing const-correct case in API
60// 1.03 - 2014-08-07 - warning fixes
61// 1.02 - 2014-07-09 - declare qsort comparison as explicitly _cdecl in Windows
62// 1.01 - 2014-06-18 - fix stb_vorbis_get_samples_float (interleaved was correct)
63// 1.0 - 2014-05-26 - fix memory leaks; fix warnings; fix bugs in >2-channel;
64// (API change) report sample rate for decode-full-file funcs
65//
66// See end of file for full version history.
67
68
69//////////////////////////////////////////////////////////////////////////////
70//
71// HEADER BEGINS HERE
72//
73
74#ifndef STB_VORBIS_INCLUDE_STB_VORBIS_H
75#define STB_VORBIS_INCLUDE_STB_VORBIS_H
76
77#if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
78#define STB_VORBIS_NO_STDIO 1
79#endif
80
81#ifndef STB_VORBIS_NO_STDIO
82#include <stdio.h>
83#endif
84
85#ifdef __cplusplus
86extern "C" {
87#endif
88
89/////////// THREAD SAFETY
90
91// Individual stb_vorbis* handles are not thread-safe; you cannot decode from
92// them from multiple threads at the same time. However, you can have multiple
93// stb_vorbis* handles and decode from them independently in multiple thrads.
94
95
96/////////// MEMORY ALLOCATION
97
98// normally stb_vorbis uses malloc() to allocate memory at startup,
99// and alloca() to allocate temporary memory during a frame on the
100// stack. (Memory consumption will depend on the amount of setup
101// data in the file and how you set the compile flags for speed
102// vs. size. In my test files the maximal-size usage is ~150KB.)
103//
104// You can modify the wrapper functions in the source (setup_malloc,
105// setup_temp_malloc, temp_malloc) to change this behavior, or you
106// can use a simpler allocation model: you pass in a buffer from
107// which stb_vorbis will allocate _all_ its memory (including the
108// temp memory). "open" may fail with a VORBIS_outofmem if you
109// do not pass in enough data; there is no way to determine how
110// much you do need except to succeed (at which point you can
111// query get_info to find the exact amount required. yes I know
112// this is lame).
113//
114// If you pass in a non-NULL buffer of the type below, allocation
115// will occur from it as described above. Otherwise just pass NULL
116// to use malloc()/alloca()
117
118typedef struct
119{
120 char *alloc_buffer;
121 int alloc_buffer_length_in_bytes;
122} stb_vorbis_alloc;
123
124
125/////////// FUNCTIONS USEABLE WITH ALL INPUT MODES
126
127typedef struct stb_vorbis stb_vorbis;
128
129typedef struct
130{
131 unsigned int sample_rate;
132 int channels;
133
134 unsigned int setup_memory_required;
135 unsigned int setup_temp_memory_required;
136 unsigned int temp_memory_required;
137
138 int max_frame_size;
139} stb_vorbis_info;
140
141typedef struct
142{
143 char *vendor;
144
145 int comment_list_length;
146 char **comment_list;
147} stb_vorbis_comment;
148
149// get general information about the file
150extern stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f);
151
152// get ogg comments
153extern stb_vorbis_comment stb_vorbis_get_comment(stb_vorbis *f);
154
155// get the last error detected (clears it, too)
156extern int stb_vorbis_get_error(stb_vorbis *f);
157
158// close an ogg vorbis file and free all memory in use
159extern void stb_vorbis_close(stb_vorbis *f);
160
161// this function returns the offset (in samples) from the beginning of the
162// file that will be returned by the next decode, if it is known, or -1
163// otherwise. after a flush_pushdata() call, this may take a while before
164// it becomes valid again.
165// NOT WORKING YET after a seek with PULLDATA API
166extern int stb_vorbis_get_sample_offset(stb_vorbis *f);
167
168// returns the current seek point within the file, or offset from the beginning
169// of the memory buffer. In pushdata mode it returns 0.
170extern unsigned int stb_vorbis_get_file_offset(stb_vorbis *f);
171
172/////////// PUSHDATA API
173
174#ifndef STB_VORBIS_NO_PUSHDATA_API
175
176// this API allows you to get blocks of data from any source and hand
177// them to stb_vorbis. you have to buffer them; stb_vorbis will tell
178// you how much it used, and you have to give it the rest next time;
179// and stb_vorbis may not have enough data to work with and you will
180// need to give it the same data again PLUS more. Note that the Vorbis
181// specification does not bound the size of an individual frame.
182
183extern stb_vorbis *stb_vorbis_open_pushdata(
184 const unsigned char * datablock, int datablock_length_in_bytes,
185 int *datablock_memory_consumed_in_bytes,
186 int *error,
187 const stb_vorbis_alloc *alloc_buffer);
188// create a vorbis decoder by passing in the initial data block containing
189// the ogg&vorbis headers (you don't need to do parse them, just provide
190// the first N bytes of the file--you're told if it's not enough, see below)
191// on success, returns an stb_vorbis *, does not set error, returns the amount of
192// data parsed/consumed on this call in *datablock_memory_consumed_in_bytes;
193// on failure, returns NULL on error and sets *error, does not change *datablock_memory_consumed
194// if returns NULL and *error is VORBIS_need_more_data, then the input block was
195// incomplete and you need to pass in a larger block from the start of the file
196
197extern int stb_vorbis_decode_frame_pushdata(
198 stb_vorbis *f,
199 const unsigned char *datablock, int datablock_length_in_bytes,
200 int *channels, // place to write number of float * buffers
201 float ***output, // place to write float ** array of float * buffers
202 int *samples // place to write number of output samples
203 );
204// decode a frame of audio sample data if possible from the passed-in data block
205//
206// return value: number of bytes we used from datablock
207//
208// possible cases:
209// 0 bytes used, 0 samples output (need more data)
210// N bytes used, 0 samples output (resynching the stream, keep going)
211// N bytes used, M samples output (one frame of data)
212// note that after opening a file, you will ALWAYS get one N-bytes,0-sample
213// frame, because Vorbis always "discards" the first frame.
214//
215// Note that on resynch, stb_vorbis will rarely consume all of the buffer,
216// instead only datablock_length_in_bytes-3 or less. This is because it wants
217// to avoid missing parts of a page header if they cross a datablock boundary,
218// without writing state-machiney code to record a partial detection.
219//
220// The number of channels returned are stored in *channels (which can be
221// NULL--it is always the same as the number of channels reported by
222// get_info). *output will contain an array of float* buffers, one per
223// channel. In other words, (*output)[0][0] contains the first sample from
224// the first channel, and (*output)[1][0] contains the first sample from
225// the second channel.
226//
227// *output points into stb_vorbis's internal output buffer storage; these
228// buffers are owned by stb_vorbis and application code should not free
229// them or modify their contents. They are transient and will be overwritten
230// once you ask for more data to get decoded, so be sure to grab any data
231// you need before then.
232
233extern void stb_vorbis_flush_pushdata(stb_vorbis *f);
234// inform stb_vorbis that your next datablock will not be contiguous with
235// previous ones (e.g. you've seeked in the data); future attempts to decode
236// frames will cause stb_vorbis to resynchronize (as noted above), and
237// once it sees a valid Ogg page (typically 4-8KB, as large as 64KB), it
238// will begin decoding the _next_ frame.
239//
240// if you want to seek using pushdata, you need to seek in your file, then
241// call stb_vorbis_flush_pushdata(), then start calling decoding, then once
242// decoding is returning you data, call stb_vorbis_get_sample_offset, and
243// if you don't like the result, seek your file again and repeat.
244#endif
245
246
247////////// PULLING INPUT API
248
249#ifndef STB_VORBIS_NO_PULLDATA_API
250// This API assumes stb_vorbis is allowed to pull data from a source--
251// either a block of memory containing the _entire_ vorbis stream, or a
252// FILE * that you or it create, or possibly some other reading mechanism
253// if you go modify the source to replace the FILE * case with some kind
254// of callback to your code. (But if you don't support seeking, you may
255// just want to go ahead and use pushdata.)
256
257#if !defined(STB_VORBIS_NO_STDIO) && !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
258extern int stb_vorbis_decode_filename(const char *filename, int *channels, int *sample_rate, short **output);
259#endif
260#if !defined(STB_VORBIS_NO_INTEGER_CONVERSION)
261extern int stb_vorbis_decode_memory(const unsigned char *mem, int len, int *channels, int *sample_rate, short **output);
262#endif
263// decode an entire file and output the data interleaved into a malloc()ed
264// buffer stored in *output. The return value is the number of samples
265// decoded, or -1 if the file could not be opened or was not an ogg vorbis file.
266// When you're done with it, just free() the pointer returned in *output.
267
268extern stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len,
269 int *error, const stb_vorbis_alloc *alloc_buffer);
270// create an ogg vorbis decoder from an ogg vorbis stream in memory (note
271// this must be the entire stream!). on failure, returns NULL and sets *error
272
273#ifndef STB_VORBIS_NO_STDIO
274extern stb_vorbis * stb_vorbis_open_filename(const char *filename,
275 int *error, const stb_vorbis_alloc *alloc_buffer);
276// create an ogg vorbis decoder from a filename via fopen(). on failure,
277// returns NULL and sets *error (possibly to VORBIS_file_open_failure).
278
279extern stb_vorbis * stb_vorbis_open_file(FILE *f, int close_handle_on_close,
280 int *error, const stb_vorbis_alloc *alloc_buffer);
281// create an ogg vorbis decoder from an open FILE *, looking for a stream at
282// the _current_ seek point (ftell). on failure, returns NULL and sets *error.
283// note that stb_vorbis must "own" this stream; if you seek it in between
284// calls to stb_vorbis, it will become confused. Moreover, if you attempt to
285// perform stb_vorbis_seek_*() operations on this file, it will assume it
286// owns the _entire_ rest of the file after the start point. Use the next
287// function, stb_vorbis_open_file_section(), to limit it.
288
289extern stb_vorbis * stb_vorbis_open_file_section(FILE *f, int close_handle_on_close,
290 int *error, const stb_vorbis_alloc *alloc_buffer, unsigned int len);
291// create an ogg vorbis decoder from an open FILE *, looking for a stream at
292// the _current_ seek point (ftell); the stream will be of length 'len' bytes.
293// on failure, returns NULL and sets *error. note that stb_vorbis must "own"
294// this stream; if you seek it in between calls to stb_vorbis, it will become
295// confused.
296#endif
297
298extern int stb_vorbis_seek_frame(stb_vorbis *f, unsigned int sample_number);
299extern int stb_vorbis_seek(stb_vorbis *f, unsigned int sample_number);
300// these functions seek in the Vorbis file to (approximately) 'sample_number'.
301// after calling seek_frame(), the next call to get_frame_*() will include
302// the specified sample. after calling stb_vorbis_seek(), the next call to
303// stb_vorbis_get_samples_* will start with the specified sample. If you
304// do not need to seek to EXACTLY the target sample when using get_samples_*,
305// you can also use seek_frame().
306
307extern int stb_vorbis_seek_start(stb_vorbis *f);
308// this function is equivalent to stb_vorbis_seek(f,0)
309
310extern unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f);
311extern float stb_vorbis_stream_length_in_seconds(stb_vorbis *f);
312// these functions return the total length of the vorbis stream
313
314extern int stb_vorbis_get_frame_float(stb_vorbis *f, int *channels, float ***output);
315// decode the next frame and return the number of samples. the number of
316// channels returned are stored in *channels (which can be NULL--it is always
317// the same as the number of channels reported by get_info). *output will
318// contain an array of float* buffers, one per channel. These outputs will
319// be overwritten on the next call to stb_vorbis_get_frame_*.
320//
321// You generally should not intermix calls to stb_vorbis_get_frame_*()
322// and stb_vorbis_get_samples_*(), since the latter calls the former.
323
324#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
325extern int stb_vorbis_get_frame_short_interleaved(stb_vorbis *f, int num_c, short *buffer, int num_shorts);
326extern int stb_vorbis_get_frame_short (stb_vorbis *f, int num_c, short **buffer, int num_samples);
327#endif
328// decode the next frame and return the number of *samples* per channel.
329// Note that for interleaved data, you pass in the number of shorts (the
330// size of your array), but the return value is the number of samples per
331// channel, not the total number of samples.
332//
333// The data is coerced to the number of channels you request according to the
334// channel coercion rules (see below). You must pass in the size of your
335// buffer(s) so that stb_vorbis will not overwrite the end of the buffer.
336// The maximum buffer size needed can be gotten from get_info(); however,
337// the Vorbis I specification implies an absolute maximum of 4096 samples
338// per channel.
339
340// Channel coercion rules:
341// Let M be the number of channels requested, and N the number of channels present,
342// and Cn be the nth channel; let stereo L be the sum of all L and center channels,
343// and stereo R be the sum of all R and center channels (channel assignment from the
344// vorbis spec).
345// M N output
346// 1 k sum(Ck) for all k
347// 2 * stereo L, stereo R
348// k l k > l, the first l channels, then 0s
349// k l k <= l, the first k channels
350// Note that this is not _good_ surround etc. mixing at all! It's just so
351// you get something useful.
352
353extern int stb_vorbis_get_samples_float_interleaved(stb_vorbis *f, int channels, float *buffer, int num_floats);
354extern int stb_vorbis_get_samples_float(stb_vorbis *f, int channels, float **buffer, int num_samples);
355// gets num_samples samples, not necessarily on a frame boundary--this requires
356// buffering so you have to supply the buffers. DOES NOT APPLY THE COERCION RULES.
357// Returns the number of samples stored per channel; it may be less than requested
358// at the end of the file. If there are no more samples in the file, returns 0.
359
360#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
361extern int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short *buffer, int num_shorts);
362extern int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, int num_samples);
363#endif
364// gets num_samples samples, not necessarily on a frame boundary--this requires
365// buffering so you have to supply the buffers. Applies the coercion rules above
366// to produce 'channels' channels. Returns the number of samples stored per channel;
367// it may be less than requested at the end of the file. If there are no more
368// samples in the file, returns 0.
369
370#endif
371
372//////// ERROR CODES
373
374enum STBVorbisError
375{
376 VORBIS__no_error,
377
378 VORBIS_need_more_data=1, // not a real error
379
380 VORBIS_invalid_api_mixing, // can't mix API modes
381 VORBIS_outofmem, // not enough memory
382 VORBIS_feature_not_supported, // uses floor 0
383 VORBIS_too_many_channels, // STB_VORBIS_MAX_CHANNELS is too small
384 VORBIS_file_open_failure, // fopen() failed
385 VORBIS_seek_without_length, // can't seek in unknown-length file
386
387 VORBIS_unexpected_eof=10, // file is truncated?
388 VORBIS_seek_invalid, // seek past EOF
389
390 // decoding errors (corrupt/invalid stream) -- you probably
391 // don't care about the exact details of these
392
393 // vorbis errors:
394 VORBIS_invalid_setup=20,
395 VORBIS_invalid_stream,
396
397 // ogg errors:
398 VORBIS_missing_capture_pattern=30,
399 VORBIS_invalid_stream_structure_version,
400 VORBIS_continued_packet_flag_invalid,
401 VORBIS_incorrect_stream_serial_number,
402 VORBIS_invalid_first_page,
403 VORBIS_bad_packet_type,
404 VORBIS_cant_find_last_page,
405 VORBIS_seek_failed,
406 VORBIS_ogg_skeleton_not_supported
407};
408
409
410#ifdef __cplusplus
411}
412#endif
413
414#endif // STB_VORBIS_INCLUDE_STB_VORBIS_H
415//
416// HEADER ENDS HERE
417//
418//////////////////////////////////////////////////////////////////////////////
419
420#ifndef STB_VORBIS_HEADER_ONLY
421
422// global configuration settings (e.g. set these in the project/makefile),
423// or just set them in this file at the top (although ideally the first few
424// should be visible when the header file is compiled too, although it's not
425// crucial)
426
427// STB_VORBIS_NO_PUSHDATA_API
428// does not compile the code for the various stb_vorbis_*_pushdata()
429// functions
430// #define STB_VORBIS_NO_PUSHDATA_API
431
432// STB_VORBIS_NO_PULLDATA_API
433// does not compile the code for the non-pushdata APIs
434// #define STB_VORBIS_NO_PULLDATA_API
435
436// STB_VORBIS_NO_STDIO
437// does not compile the code for the APIs that use FILE *s internally
438// or externally (implied by STB_VORBIS_NO_PULLDATA_API)
439// #define STB_VORBIS_NO_STDIO
440
441// STB_VORBIS_NO_INTEGER_CONVERSION
442// does not compile the code for converting audio sample data from
443// float to integer (implied by STB_VORBIS_NO_PULLDATA_API)
444// #define STB_VORBIS_NO_INTEGER_CONVERSION
445
446// STB_VORBIS_NO_FAST_SCALED_FLOAT
447// does not use a fast float-to-int trick to accelerate float-to-int on
448// most platforms which requires endianness be defined correctly.
449//#define STB_VORBIS_NO_FAST_SCALED_FLOAT
450
451
452// STB_VORBIS_MAX_CHANNELS [number]
453// globally define this to the maximum number of channels you need.
454// The spec does not put a restriction on channels except that
455// the count is stored in a byte, so 255 is the hard limit.
456// Reducing this saves about 16 bytes per value, so using 16 saves
457// (255-16)*16 or around 4KB. Plus anything other memory usage
458// I forgot to account for. Can probably go as low as 8 (7.1 audio),
459// 6 (5.1 audio), or 2 (stereo only).
460#ifndef STB_VORBIS_MAX_CHANNELS
461#define STB_VORBIS_MAX_CHANNELS 16 // enough for anyone?
462#endif
463
464// STB_VORBIS_PUSHDATA_CRC_COUNT [number]
465// after a flush_pushdata(), stb_vorbis begins scanning for the
466// next valid page, without backtracking. when it finds something
467// that looks like a page, it streams through it and verifies its
468// CRC32. Should that validation fail, it keeps scanning. But it's
469// possible that _while_ streaming through to check the CRC32 of
470// one candidate page, it sees another candidate page. This #define
471// determines how many "overlapping" candidate pages it can search
472// at once. Note that "real" pages are typically ~4KB to ~8KB, whereas
473// garbage pages could be as big as 64KB, but probably average ~16KB.
474// So don't hose ourselves by scanning an apparent 64KB page and
475// missing a ton of real ones in the interim; so minimum of 2
476#ifndef STB_VORBIS_PUSHDATA_CRC_COUNT
477#define STB_VORBIS_PUSHDATA_CRC_COUNT 4
478#endif
479
480// STB_VORBIS_FAST_HUFFMAN_LENGTH [number]
481// sets the log size of the huffman-acceleration table. Maximum
482// supported value is 24. with larger numbers, more decodings are O(1),
483// but the table size is larger so worse cache missing, so you'll have
484// to probe (and try multiple ogg vorbis files) to find the sweet spot.
485#ifndef STB_VORBIS_FAST_HUFFMAN_LENGTH
486#define STB_VORBIS_FAST_HUFFMAN_LENGTH 10
487#endif
488
489// STB_VORBIS_FAST_BINARY_LENGTH [number]
490// sets the log size of the binary-search acceleration table. this
491// is used in similar fashion to the fast-huffman size to set initial
492// parameters for the binary search
493
494// STB_VORBIS_FAST_HUFFMAN_INT
495// The fast huffman tables are much more efficient if they can be
496// stored as 16-bit results instead of 32-bit results. This restricts
497// the codebooks to having only 65535 possible outcomes, though.
498// (At least, accelerated by the huffman table.)
499#ifndef STB_VORBIS_FAST_HUFFMAN_INT
500#define STB_VORBIS_FAST_HUFFMAN_SHORT
501#endif
502
503// STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
504// If the 'fast huffman' search doesn't succeed, then stb_vorbis falls
505// back on binary searching for the correct one. This requires storing
506// extra tables with the huffman codes in sorted order. Defining this
507// symbol trades off space for speed by forcing a linear search in the
508// non-fast case, except for "sparse" codebooks.
509// #define STB_VORBIS_NO_HUFFMAN_BINARY_SEARCH
510
511// STB_VORBIS_DIVIDES_IN_RESIDUE
512// stb_vorbis precomputes the result of the scalar residue decoding
513// that would otherwise require a divide per chunk. you can trade off
514// space for time by defining this symbol.
515// #define STB_VORBIS_DIVIDES_IN_RESIDUE
516
517// STB_VORBIS_DIVIDES_IN_CODEBOOK
518// vorbis VQ codebooks can be encoded two ways: with every case explicitly
519// stored, or with all elements being chosen from a small range of values,
520// and all values possible in all elements. By default, stb_vorbis expands
521// this latter kind out to look like the former kind for ease of decoding,
522// because otherwise an integer divide-per-vector-element is required to
523// unpack the index. If you define STB_VORBIS_DIVIDES_IN_CODEBOOK, you can
524// trade off storage for speed.
525//#define STB_VORBIS_DIVIDES_IN_CODEBOOK
526
527#ifdef STB_VORBIS_CODEBOOK_SHORTS
528#error "STB_VORBIS_CODEBOOK_SHORTS is no longer supported as it produced incorrect results for some input formats"
529#endif
530
531// STB_VORBIS_DIVIDE_TABLE
532// this replaces small integer divides in the floor decode loop with
533// table lookups. made less than 1% difference, so disabled by default.
534
535// STB_VORBIS_NO_INLINE_DECODE
536// disables the inlining of the scalar codebook fast-huffman decode.
537// might save a little codespace; useful for debugging
538// #define STB_VORBIS_NO_INLINE_DECODE
539
540// STB_VORBIS_NO_DEFER_FLOOR
541// Normally we only decode the floor without synthesizing the actual
542// full curve. We can instead synthesize the curve immediately. This
543// requires more memory and is very likely slower, so I don't think
544// you'd ever want to do it except for debugging.
545// #define STB_VORBIS_NO_DEFER_FLOOR
546
547
548
549
550//////////////////////////////////////////////////////////////////////////////
551
552#ifdef STB_VORBIS_NO_PULLDATA_API
553 #define STB_VORBIS_NO_INTEGER_CONVERSION
554 #define STB_VORBIS_NO_STDIO
555#endif
556
557#if defined(STB_VORBIS_NO_CRT) && !defined(STB_VORBIS_NO_STDIO)
558 #define STB_VORBIS_NO_STDIO 1
559#endif
560
561#ifndef STB_VORBIS_NO_INTEGER_CONVERSION
562#ifndef STB_VORBIS_NO_FAST_SCALED_FLOAT
563
564 // only need endianness for fast-float-to-int, which we don't
565 // use for pushdata
566
567 #ifndef STB_VORBIS_BIG_ENDIAN
568 #define STB_VORBIS_ENDIAN 0
569 #else
570 #define STB_VORBIS_ENDIAN 1
571 #endif
572
573#endif
574#endif
575
576
577#ifndef STB_VORBIS_NO_STDIO
578#include <stdio.h>
579#endif
580
581#ifndef STB_VORBIS_NO_CRT
582 #include <stdlib.h>
583 #include <string.h>
584 #include <assert.h>
585 #include <math.h>
586
587 // find definition of alloca if it's not in stdlib.h:
588 #if defined(_MSC_VER) || defined(__MINGW32__)
589 #include <malloc.h>
590 #endif
591 #if defined(__linux__) || defined(__linux) || defined(__sun__) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__)
592 #include <alloca.h>
593 #endif
594#else // STB_VORBIS_NO_CRT
595 #define NULL 0
596 #define malloc(s) 0
597 #define free(s) ((void) 0)
598 #define realloc(s) 0
599#endif // STB_VORBIS_NO_CRT
600
601#include <limits.h>
602
603#ifdef __MINGW32__
604 // eff you mingw:
605 // "fixed":
606 // http://sourceforge.net/p/mingw-w64/mailman/message/32882927/
607 // "no that broke the build, reverted, who cares about C":
608 // http://sourceforge.net/p/mingw-w64/mailman/message/32890381/
609 #ifdef __forceinline
610 #undef __forceinline
611 #endif
612 #define __forceinline
613 #ifndef alloca
614 #define alloca __builtin_alloca
615 #endif
616#elif !defined(_MSC_VER)
617 #if __GNUC__
618 #define __forceinline inline
619 #else
620 #define __forceinline
621 #endif
622#endif
623
624#if STB_VORBIS_MAX_CHANNELS > 256
625#error "Value of STB_VORBIS_MAX_CHANNELS outside of allowed range"
626#endif
627
628#if STB_VORBIS_FAST_HUFFMAN_LENGTH > 24
629#error "Value of STB_VORBIS_FAST_HUFFMAN_LENGTH outside of allowed range"
630#endif
631
632
633#if 0
634#include <crtdbg.h>
635#define CHECK(f) _CrtIsValidHeapPointer(f->channel_buffers[1])
636#else
637#define CHECK(f) ((void) 0)
638#endif
639
640#define MAX_BLOCKSIZE_LOG 13 // from specification
641#define MAX_BLOCKSIZE (1 << MAX_BLOCKSIZE_LOG)
642
643
644typedef unsigned char uint8;
645typedef signed char int8;
646typedef unsigned short uint16;
647typedef signed short int16;
648typedef unsigned int uint32;
649typedef signed int int32;
650
651#ifndef TRUE
652#define TRUE 1
653#define FALSE 0
654#endif
655
656typedef float codetype;
657
658#ifdef _MSC_VER
659#define STBV_NOTUSED(v) (void)(v)
660#else
661#define STBV_NOTUSED(v) (void)sizeof(v)
662#endif
663
664// @NOTE
665//
666// Some arrays below are tagged "//varies", which means it's actually
667// a variable-sized piece of data, but rather than malloc I assume it's
668// small enough it's better to just allocate it all together with the
669// main thing
670//
671// Most of the variables are specified with the smallest size I could pack
672// them into. It might give better performance to make them all full-sized
673// integers. It should be safe to freely rearrange the structures or change
674// the sizes larger--nothing relies on silently truncating etc., nor the
675// order of variables.
676
677#define FAST_HUFFMAN_TABLE_SIZE (1 << STB_VORBIS_FAST_HUFFMAN_LENGTH)
678#define FAST_HUFFMAN_TABLE_MASK (FAST_HUFFMAN_TABLE_SIZE - 1)
679
680typedef struct
681{
682 int dimensions, entries;
683 uint8 *codeword_lengths;
684 float minimum_value;
685 float delta_value;
686 uint8 value_bits;
687 uint8 lookup_type;
688 uint8 sequence_p;
689 uint8 sparse;
690 uint32 lookup_values;
691 codetype *multiplicands;
692 uint32 *codewords;
693 #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
694 int16 fast_huffman[FAST_HUFFMAN_TABLE_SIZE];
695 #else
696 int32 fast_huffman[FAST_HUFFMAN_TABLE_SIZE];
697 #endif
698 uint32 *sorted_codewords;
699 int *sorted_values;
700 int sorted_entries;
701} Codebook;
702
703typedef struct
704{
705 uint8 order;
706 uint16 rate;
707 uint16 bark_map_size;
708 uint8 amplitude_bits;
709 uint8 amplitude_offset;
710 uint8 number_of_books;
711 uint8 book_list[16]; // varies
712} Floor0;
713
714typedef struct
715{
716 uint8 partitions;
717 uint8 partition_class_list[32]; // varies
718 uint8 class_dimensions[16]; // varies
719 uint8 class_subclasses[16]; // varies
720 uint8 class_masterbooks[16]; // varies
721 int16 subclass_books[16][8]; // varies
722 uint16 Xlist[31*8+2]; // varies
723 uint8 sorted_order[31*8+2];
724 uint8 neighbors[31*8+2][2];
725 uint8 floor1_multiplier;
726 uint8 rangebits;
727 int values;
728} Floor1;
729
730typedef union
731{
732 Floor0 floor0;
733 Floor1 floor1;
734} Floor;
735
736typedef struct
737{
738 uint32 begin, end;
739 uint32 part_size;
740 uint8 classifications;
741 uint8 classbook;
742 uint8 **classdata;
743 int16 (*residue_books)[8];
744} Residue;
745
746typedef struct
747{
748 uint8 magnitude;
749 uint8 angle;
750 uint8 mux;
751} MappingChannel;
752
753typedef struct
754{
755 uint16 coupling_steps;
756 MappingChannel *chan;
757 uint8 submaps;
758 uint8 submap_floor[15]; // varies
759 uint8 submap_residue[15]; // varies
760} Mapping;
761
762typedef struct
763{
764 uint8 blockflag;
765 uint8 mapping;
766 uint16 windowtype;
767 uint16 transformtype;
768} Mode;
769
770typedef struct
771{
772 uint32 goal_crc; // expected crc if match
773 int bytes_left; // bytes left in packet
774 uint32 crc_so_far; // running crc
775 int bytes_done; // bytes processed in _current_ chunk
776 uint32 sample_loc; // granule pos encoded in page
777} CRCscan;
778
779typedef struct
780{
781 uint32 page_start, page_end;
782 uint32 last_decoded_sample;
783} ProbedPage;
784
785struct stb_vorbis
786{
787 // user-accessible info
788 unsigned int sample_rate;
789 int channels;
790
791 unsigned int setup_memory_required;
792 unsigned int temp_memory_required;
793 unsigned int setup_temp_memory_required;
794
795 char *vendor;
796 int comment_list_length;
797 char **comment_list;
798
799 // input config
800#ifndef STB_VORBIS_NO_STDIO
801 FILE *f;
802 uint32 f_start;
803 int close_on_free;
804#endif
805
806 uint8 *stream;
807 uint8 *stream_start;
808 uint8 *stream_end;
809
810 uint32 stream_len;
811
812 uint8 push_mode;
813
814 // the page to seek to when seeking to start, may be zero
815 uint32 first_audio_page_offset;
816
817 // p_first is the page on which the first audio packet ends
818 // (but not necessarily the page on which it starts)
819 ProbedPage p_first, p_last;
820
821 // memory management
822 stb_vorbis_alloc alloc;
823 int setup_offset;
824 int temp_offset;
825
826 // run-time results
827 int eof;
828 enum STBVorbisError error;
829
830 // user-useful data
831
832 // header info
833 int blocksize[2];
834 int blocksize_0, blocksize_1;
835 int codebook_count;
836 Codebook *codebooks;
837 int floor_count;
838 uint16 floor_types[64]; // varies
839 Floor *floor_config;
840 int residue_count;
841 uint16 residue_types[64]; // varies
842 Residue *residue_config;
843 int mapping_count;
844 Mapping *mapping;
845 int mode_count;
846 Mode mode_config[64]; // varies
847
848 uint32 total_samples;
849
850 // decode buffer
851 float *channel_buffers[STB_VORBIS_MAX_CHANNELS];
852 float *outputs [STB_VORBIS_MAX_CHANNELS];
853
854 float *previous_window[STB_VORBIS_MAX_CHANNELS];
855 int previous_length;
856
857 #ifndef STB_VORBIS_NO_DEFER_FLOOR
858 int16 *finalY[STB_VORBIS_MAX_CHANNELS];
859 #else
860 float *floor_buffers[STB_VORBIS_MAX_CHANNELS];
861 #endif
862
863 uint32 current_loc; // sample location of next frame to decode
864 int current_loc_valid;
865
866 // per-blocksize precomputed data
867
868 // twiddle factors
869 float *A[2],*B[2],*C[2];
870 float *window[2];
871 uint16 *bit_reverse[2];
872
873 // current page/packet/segment streaming info
874 uint32 serial; // stream serial number for verification
875 int last_page;
876 int segment_count;
877 uint8 segments[255];
878 uint8 page_flag;
879 uint8 bytes_in_seg;
880 uint8 first_decode;
881 int next_seg;
882 int last_seg; // flag that we're on the last segment
883 int last_seg_which; // what was the segment number of the last seg?
884 uint32 acc;
885 int valid_bits;
886 int packet_bytes;
887 int end_seg_with_known_loc;
888 uint32 known_loc_for_packet;
889 int discard_samples_deferred;
890 uint32 samples_output;
891
892 // push mode scanning
893 int page_crc_tests; // only in push_mode: number of tests active; -1 if not searching
894#ifndef STB_VORBIS_NO_PUSHDATA_API
895 CRCscan scan[STB_VORBIS_PUSHDATA_CRC_COUNT];
896#endif
897
898 // sample-access
899 int channel_buffer_start;
900 int channel_buffer_end;
901};
902
903#if defined(STB_VORBIS_NO_PUSHDATA_API)
904 #define IS_PUSH_MODE(f) FALSE
905#elif defined(STB_VORBIS_NO_PULLDATA_API)
906 #define IS_PUSH_MODE(f) TRUE
907#else
908 #define IS_PUSH_MODE(f) ((f)->push_mode)
909#endif
910
911typedef struct stb_vorbis vorb;
912
913static int error(vorb *f, enum STBVorbisError e)
914{
915 f->error = e;
916 if (!f->eof && e != VORBIS_need_more_data) {
917 f->error=e; // breakpoint for debugging
918 }
919 return 0;
920}
921
922
923// these functions are used for allocating temporary memory
924// while decoding. if you can afford the stack space, use
925// alloca(); otherwise, provide a temp buffer and it will
926// allocate out of those.
927
928#define array_size_required(count,size) (count*(sizeof(void *)+(size)))
929
930#define temp_alloc(f,size) (f->alloc.alloc_buffer ? setup_temp_malloc(f,size) : alloca(size))
931#define temp_free(f,p) (void)0
932#define temp_alloc_save(f) ((f)->temp_offset)
933#define temp_alloc_restore(f,p) ((f)->temp_offset = (p))
934
935#define temp_block_array(f,count,size) make_block_array(temp_alloc(f,array_size_required(count,size)), count, size)
936
937// given a sufficiently large block of memory, make an array of pointers to subblocks of it
938static void *make_block_array(void *mem, int count, int size)
939{
940 int i;
941 void ** p = (void **) mem;
942 char *q = (char *) (p + count);
943 for (i=0; i < count; ++i) {
944 p[i] = q;
945 q += size;
946 }
947 return p;
948}
949
950static void *setup_malloc(vorb *f, int sz)
951{
952 sz = (sz+7) & ~7; // round up to nearest 8 for alignment of future allocs.
953 f->setup_memory_required += sz;
954 if (f->alloc.alloc_buffer) {
955 void *p = (char *) f->alloc.alloc_buffer + f->setup_offset;
956 if (f->setup_offset + sz > f->temp_offset) return NULL;
957 f->setup_offset += sz;
958 return p;
959 }
960 return sz ? malloc(sz) : NULL;
961}
962
963static void setup_free(vorb *f, void *p)
964{
965 if (f->alloc.alloc_buffer) return; // do nothing; setup mem is a stack
966 free(p);
967}
968
969static void *setup_temp_malloc(vorb *f, int sz)
970{
971 sz = (sz+7) & ~7; // round up to nearest 8 for alignment of future allocs.
972 if (f->alloc.alloc_buffer) {
973 if (f->temp_offset - sz < f->setup_offset) return NULL;
974 f->temp_offset -= sz;
975 return (char *) f->alloc.alloc_buffer + f->temp_offset;
976 }
977 return malloc(sz);
978}
979
980static void setup_temp_free(vorb *f, void *p, int sz)
981{
982 if (f->alloc.alloc_buffer) {
983 f->temp_offset += (sz+7)&~7;
984 return;
985 }
986 free(p);
987}
988
989#define CRC32_POLY 0x04c11db7 // from spec
990
991static uint32 crc_table[256];
992static void crc32_init(void)
993{
994 int i,j;
995 uint32 s;
996 for(i=0; i < 256; i++) {
997 for (s=(uint32) i << 24, j=0; j < 8; ++j)
998 s = (s << 1) ^ (s >= (1U<<31) ? CRC32_POLY : 0);
999 crc_table[i] = s;
1000 }
1001}
1002
1003static __forceinline uint32 crc32_update(uint32 crc, uint8 byte)
1004{
1005 return (crc << 8) ^ crc_table[byte ^ (crc >> 24)];
1006}
1007
1008
1009// used in setup, and for huffman that doesn't go fast path
1010static unsigned int bit_reverse(unsigned int n)
1011{
1012 n = ((n & 0xAAAAAAAA) >> 1) | ((n & 0x55555555) << 1);
1013 n = ((n & 0xCCCCCCCC) >> 2) | ((n & 0x33333333) << 2);
1014 n = ((n & 0xF0F0F0F0) >> 4) | ((n & 0x0F0F0F0F) << 4);
1015 n = ((n & 0xFF00FF00) >> 8) | ((n & 0x00FF00FF) << 8);
1016 return (n >> 16) | (n << 16);
1017}
1018
1019static float square(float x)
1020{
1021 return x*x;
1022}
1023
1024// this is a weird definition of log2() for which log2(1) = 1, log2(2) = 2, log2(4) = 3
1025// as required by the specification. fast(?) implementation from stb.h
1026// @OPTIMIZE: called multiple times per-packet with "constants"; move to setup
1027static int ilog(int32 n)
1028{
1029 static signed char log2_4[16] = { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
1030
1031 if (n < 0) return 0; // signed n returns 0
1032
1033 // 2 compares if n < 16, 3 compares otherwise (4 if signed or n > 1<<29)
1034 if (n < (1 << 14))
1035 if (n < (1 << 4)) return 0 + log2_4[n ];
1036 else if (n < (1 << 9)) return 5 + log2_4[n >> 5];
1037 else return 10 + log2_4[n >> 10];
1038 else if (n < (1 << 24))
1039 if (n < (1 << 19)) return 15 + log2_4[n >> 15];
1040 else return 20 + log2_4[n >> 20];
1041 else if (n < (1 << 29)) return 25 + log2_4[n >> 25];
1042 else return 30 + log2_4[n >> 30];
1043}
1044
1045#ifndef M_PI
1046 #define M_PI 3.14159265358979323846264f // from CRC
1047#endif
1048
1049// code length assigned to a value with no huffman encoding
1050#define NO_CODE 255
1051
1052/////////////////////// LEAF SETUP FUNCTIONS //////////////////////////
1053//
1054// these functions are only called at setup, and only a few times
1055// per file
1056
1057static float float32_unpack(uint32 x)
1058{
1059 // from the specification
1060 uint32 mantissa = x & 0x1fffff;
1061 uint32 sign = x & 0x80000000;
1062 uint32 exp = (x & 0x7fe00000) >> 21;
1063 double res = sign ? -(double)mantissa : (double)mantissa;
1064 return (float) ldexp((float)res, (int)exp-788);
1065}
1066
1067
1068// zlib & jpeg huffman tables assume that the output symbols
1069// can either be arbitrarily arranged, or have monotonically
1070// increasing frequencies--they rely on the lengths being sorted;
1071// this makes for a very simple generation algorithm.
1072// vorbis allows a huffman table with non-sorted lengths. This
1073// requires a more sophisticated construction, since symbols in
1074// order do not map to huffman codes "in order".
1075static void add_entry(Codebook *c, uint32 huff_code, int symbol, int count, int len, uint32 *values)
1076{
1077 if (!c->sparse) {
1078 c->codewords [symbol] = huff_code;
1079 } else {
1080 c->codewords [count] = huff_code;
1081 c->codeword_lengths[count] = len;
1082 values [count] = symbol;
1083 }
1084}
1085
1086static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values)
1087{
1088 int i,k,m=0;
1089 uint32 available[32];
1090
1091 memset(available, 0, sizeof(available));
1092 // find the first entry
1093 for (k=0; k < n; ++k) if (len[k] < NO_CODE) break;
1094 if (k == n) { assert(c->sorted_entries == 0); return TRUE; }
1095 assert(len[k] < 32); // no error return required, code reading lens checks this
1096 // add to the list
1097 add_entry(c, 0, k, m++, len[k], values);
1098 // add all available leaves
1099 for (i=1; i <= len[k]; ++i)
1100 available[i] = 1U << (32-i);
1101 // note that the above code treats the first case specially,
1102 // but it's really the same as the following code, so they
1103 // could probably be combined (except the initial code is 0,
1104 // and I use 0 in available[] to mean 'empty')
1105 for (i=k+1; i < n; ++i) {
1106 uint32 res;
1107 int z = len[i], y;
1108 if (z == NO_CODE) continue;
1109 assert(z < 32); // no error return required, code reading lens checks this
1110 // find lowest available leaf (should always be earliest,
1111 // which is what the specification calls for)
1112 // note that this property, and the fact we can never have
1113 // more than one free leaf at a given level, isn't totally
1114 // trivial to prove, but it seems true and the assert never
1115 // fires, so!
1116 while (z > 0 && !available[z]) --z;
1117 if (z == 0) { return FALSE; }
1118 res = available[z];
1119 available[z] = 0;
1120 add_entry(c, bit_reverse(res), i, m++, len[i], values);
1121 // propagate availability up the tree
1122 if (z != len[i]) {
1123 for (y=len[i]; y > z; --y) {
1124 assert(available[y] == 0);
1125 available[y] = res + (1 << (32-y));
1126 }
1127 }
1128 }
1129 return TRUE;
1130}
1131
1132// accelerated huffman table allows fast O(1) match of all symbols
1133// of length <= STB_VORBIS_FAST_HUFFMAN_LENGTH
1134static void compute_accelerated_huffman(Codebook *c)
1135{
1136 int i, len;
1137 for (i=0; i < FAST_HUFFMAN_TABLE_SIZE; ++i)
1138 c->fast_huffman[i] = -1;
1139
1140 len = c->sparse ? c->sorted_entries : c->entries;
1141 #ifdef STB_VORBIS_FAST_HUFFMAN_SHORT
1142 if (len > 32767) len = 32767; // largest possible value we can encode!
1143 #endif
1144 for (i=0; i < len; ++i) {
1145 if (c->codeword_lengths[i] <= STB_VORBIS_FAST_HUFFMAN_LENGTH) {
1146 uint32 z = c->sparse ? bit_reverse(c->sorted_codewords[i]) : c->codewords[i];
1147 // set table entries for all bit combinations in the higher bits
1148 while (z < FAST_HUFFMAN_TABLE_SIZE) {
1149 c->fast_huffman[z] = i;
1150 z += 1 << c->codeword_lengths[i];
1151 }
1152 }
1153 }
1154}
1155
1156#ifdef _MSC_VER
1157#define STBV_CDECL __cdecl
1158#else
1159#define STBV_CDECL
1160#endif
1161
1162static int STBV_CDECL uint32_compare(const void *p, const void *q)
1163{
1164 uint32 x = * (uint32 *) p;
1165 uint32 y = * (uint32 *) q;
1166 return x < y ? -1 : x > y;
1167}
1168
1169static int include_in_sort(Codebook *c, uint8 len)
1170{
1171 if (c->sparse) { assert(len != NO_CODE); return TRUE; }
1172 if (len == NO_CODE) return FALSE;
1173 if (len > STB_VORBIS_FAST_HUFFMAN_LENGTH) return TRUE;
1174 return FALSE;
1175}
1176
1177// if the fast table above doesn't work, we want to binary
1178// search them... need to reverse the bits
1179static void compute_sorted_huffman(Codebook *c, uint8 *lengths, uint32 *values)
1180{
1181 int i, len;
1182 // build a list of all the entries
1183 // OPTIMIZATION: don't include the short ones, since they'll be caught by FAST_HUFFMAN.
1184 // this is kind of a frivolous optimization--I don't see any performance improvement,
1185 // but it's like 4 extra lines of code, so.
1186 if (!c->sparse) {
1187 int k = 0;
1188 for (i=0; i < c->entries; ++i)
1189 if (include_in_sort(c, lengths[i]))
1190 c->sorted_codewords[k++] = bit_reverse(c->codewords[i]);
1191 assert(k == c->sorted_entries);
1192 } else {
1193 for (i=0; i < c->sorted_entries; ++i)
1194 c->sorted_codewords[i] = bit_reverse(c->codewords[i]);
1195 }
1196
1197 qsort(c->sorted_codewords, c->sorted_entries, sizeof(c->sorted_codewords[0]), uint32_compare);
1198 c->sorted_codewords[c->sorted_entries] = 0xffffffff;
1199
1200 len = c->sparse ? c->sorted_entries : c->entries;
1201 // now we need to indicate how they correspond; we could either
1202 // #1: sort a different data structure that says who they correspond to
1203 // #2: for each sorted entry, search the original list to find who corresponds
1204 // #3: for each original entry, find the sorted entry
1205 // #1 requires extra storage, #2 is slow, #3 can use binary search!
1206 for (i=0; i < len; ++i) {
1207 int huff_len = c->sparse ? lengths[values[i]] : lengths[i];
1208 if (include_in_sort(c,huff_len)) {
1209 uint32 code = bit_reverse(c->codewords[i]);
1210 int x=0, n=c->sorted_entries;
1211 while (n > 1) {
1212 // invariant: sc[x] <= code < sc[x+n]
1213 int m = x + (n >> 1);
1214 if (c->sorted_codewords[m] <= code) {
1215 x = m;
1216 n -= (n>>1);
1217 } else {
1218 n >>= 1;
1219 }
1220 }
1221 assert(c->sorted_codewords[x] == code);
1222 if (c->sparse) {
1223 c->sorted_values[x] = values[i];
1224 c->codeword_lengths[x] = huff_len;
1225 } else {
1226 c->sorted_values[x] = i;
1227 }
1228 }
1229 }
1230}
1231
1232// only run while parsing the header (3 times)
1233static int vorbis_validate(uint8 *data)
1234{
1235 static uint8 vorbis[6] = { 'v', 'o', 'r', 'b', 'i', 's' };
1236 return memcmp(data, vorbis, 6) == 0;
1237}
1238
1239// called from setup only, once per code book
1240// (formula implied by specification)
1241static int lookup1_values(int entries, int dim)
1242{
1243 int r = (int) floor(exp((float) log((float) entries) / dim));
1244 if ((int) floor(pow((float) r+1, dim)) <= entries) // (int) cast for MinGW warning;
1245 ++r; // floor() to avoid _ftol() when non-CRT
1246 if (pow((float) r+1, dim) <= entries)
1247 return -1;
1248 if ((int) floor(pow((float) r, dim)) > entries)
1249 return -1;
1250 return r;
1251}
1252
1253// called twice per file
1254static void compute_twiddle_factors(int n, float *A, float *B, float *C)
1255{
1256 int n4 = n >> 2, n8 = n >> 3;
1257 int k,k2;
1258
1259 for (k=k2=0; k < n4; ++k,k2+=2) {
1260 A[k2 ] = (float) cos(4*k*M_PI/n);
1261 A[k2+1] = (float) -sin(4*k*M_PI/n);
1262 B[k2 ] = (float) cos((k2+1)*M_PI/n/2) * 0.5f;
1263 B[k2+1] = (float) sin((k2+1)*M_PI/n/2) * 0.5f;
1264 }
1265 for (k=k2=0; k < n8; ++k,k2+=2) {
1266 C[k2 ] = (float) cos(2*(k2+1)*M_PI/n);
1267 C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
1268 }
1269}
1270
1271static void compute_window(int n, float *window)
1272{
1273 int n2 = n >> 1, i;
1274 for (i=0; i < n2; ++i)
1275 window[i] = (float) sin(0.5 * M_PI * square((float) sin((i - 0 + 0.5) / n2 * 0.5 * M_PI)));
1276}
1277
1278static void compute_bitreverse(int n, uint16 *rev)
1279{
1280 int ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
1281 int i, n8 = n >> 3;
1282 for (i=0; i < n8; ++i)
1283 rev[i] = (bit_reverse(i) >> (32-ld+3)) << 2;
1284}
1285
1286static int init_blocksize(vorb *f, int b, int n)
1287{
1288 int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
1289 f->A[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1290 f->B[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1291 f->C[b] = (float *) setup_malloc(f, sizeof(float) * n4);
1292 if (!f->A[b] || !f->B[b] || !f->C[b]) return error(f, VORBIS_outofmem);
1293 compute_twiddle_factors(n, f->A[b], f->B[b], f->C[b]);
1294 f->window[b] = (float *) setup_malloc(f, sizeof(float) * n2);
1295 if (!f->window[b]) return error(f, VORBIS_outofmem);
1296 compute_window(n, f->window[b]);
1297 f->bit_reverse[b] = (uint16 *) setup_malloc(f, sizeof(uint16) * n8);
1298 if (!f->bit_reverse[b]) return error(f, VORBIS_outofmem);
1299 compute_bitreverse(n, f->bit_reverse[b]);
1300 return TRUE;
1301}
1302
1303static void neighbors(uint16 *x, int n, int *plow, int *phigh)
1304{
1305 int low = -1;
1306 int high = 65536;
1307 int i;
1308 for (i=0; i < n; ++i) {
1309 if (x[i] > low && x[i] < x[n]) { *plow = i; low = x[i]; }
1310 if (x[i] < high && x[i] > x[n]) { *phigh = i; high = x[i]; }
1311 }
1312}
1313
1314// this has been repurposed so y is now the original index instead of y
1315typedef struct
1316{
1317 uint16 x,id;
1318} stbv__floor_ordering;
1319
1320static int STBV_CDECL point_compare(const void *p, const void *q)
1321{
1322 stbv__floor_ordering *a = (stbv__floor_ordering *) p;
1323 stbv__floor_ordering *b = (stbv__floor_ordering *) q;
1324 return a->x < b->x ? -1 : a->x > b->x;
1325}
1326
1327//
1328/////////////////////// END LEAF SETUP FUNCTIONS //////////////////////////
1329
1330
1331#if defined(STB_VORBIS_NO_STDIO)
1332 #define USE_MEMORY(z) TRUE
1333#else
1334 #define USE_MEMORY(z) ((z)->stream)
1335#endif
1336
1337static uint8 get8(vorb *z)
1338{
1339 if (USE_MEMORY(z)) {
1340 if (z->stream >= z->stream_end) { z->eof = TRUE; return 0; }
1341 return *z->stream++;
1342 }
1343
1344 #ifndef STB_VORBIS_NO_STDIO
1345 {
1346 int c = fgetc(z->f);
1347 if (c == EOF) { z->eof = TRUE; return 0; }
1348 return c;
1349 }
1350 #endif
1351}
1352
1353static uint32 get32(vorb *f)
1354{
1355 uint32 x;
1356 x = get8(f);
1357 x += get8(f) << 8;
1358 x += get8(f) << 16;
1359 x += (uint32) get8(f) << 24;
1360 return x;
1361}
1362
1363static int getn(vorb *z, uint8 *data, int n)
1364{
1365 if (USE_MEMORY(z)) {
1366 if (z->stream+n > z->stream_end) { z->eof = 1; return 0; }
1367 memcpy(data, z->stream, n);
1368 z->stream += n;
1369 return 1;
1370 }
1371
1372 #ifndef STB_VORBIS_NO_STDIO
1373 if (fread(data, n, 1, z->f) == 1)
1374 return 1;
1375 else {
1376 z->eof = 1;
1377 return 0;
1378 }
1379 #endif
1380}
1381
1382static void skip(vorb *z, int n)
1383{
1384 if (USE_MEMORY(z)) {
1385 z->stream += n;
1386 if (z->stream >= z->stream_end) z->eof = 1;
1387 return;
1388 }
1389 #ifndef STB_VORBIS_NO_STDIO
1390 {
1391 long x = ftell(z->f);
1392 fseek(z->f, x+n, SEEK_SET);
1393 }
1394 #endif
1395}
1396
1397static int set_file_offset(stb_vorbis *f, unsigned int loc)
1398{
1399 #ifndef STB_VORBIS_NO_PUSHDATA_API
1400 if (f->push_mode) return 0;
1401 #endif
1402 f->eof = 0;
1403 if (USE_MEMORY(f)) {
1404 if (f->stream_start + loc >= f->stream_end || f->stream_start + loc < f->stream_start) {
1405 f->stream = f->stream_end;
1406 f->eof = 1;
1407 return 0;
1408 } else {
1409 f->stream = f->stream_start + loc;
1410 return 1;
1411 }
1412 }
1413 #ifndef STB_VORBIS_NO_STDIO
1414 if (loc + f->f_start < loc || loc >= 0x80000000) {
1415 loc = 0x7fffffff;
1416 f->eof = 1;
1417 } else {
1418 loc += f->f_start;
1419 }
1420 if (!fseek(f->f, loc, SEEK_SET))
1421 return 1;
1422 f->eof = 1;
1423 fseek(f->f, f->f_start, SEEK_END);
1424 return 0;
1425 #endif
1426}
1427
1428
1429static uint8 ogg_page_header[4] = { 0x4f, 0x67, 0x67, 0x53 };
1430
1431static int capture_pattern(vorb *f)
1432{
1433 if (0x4f != get8(f)) return FALSE;
1434 if (0x67 != get8(f)) return FALSE;
1435 if (0x67 != get8(f)) return FALSE;
1436 if (0x53 != get8(f)) return FALSE;
1437 return TRUE;
1438}
1439
1440#define PAGEFLAG_continued_packet 1
1441#define PAGEFLAG_first_page 2
1442#define PAGEFLAG_last_page 4
1443
1444static int start_page_no_capturepattern(vorb *f)
1445{
1446 uint32 loc0,loc1,n;
1447 if (f->first_decode && !IS_PUSH_MODE(f)) {
1448 f->p_first.page_start = stb_vorbis_get_file_offset(f) - 4;
1449 }
1450 // stream structure version
1451 if (0 != get8(f)) return error(f, VORBIS_invalid_stream_structure_version);
1452 // header flag
1453 f->page_flag = get8(f);
1454 // absolute granule position
1455 loc0 = get32(f);
1456 loc1 = get32(f);
1457 // @TODO: validate loc0,loc1 as valid positions?
1458 // stream serial number -- vorbis doesn't interleave, so discard
1459 get32(f);
1460 //if (f->serial != get32(f)) return error(f, VORBIS_incorrect_stream_serial_number);
1461 // page sequence number
1462 n = get32(f);
1463 f->last_page = n;
1464 // CRC32
1465 get32(f);
1466 // page_segments
1467 f->segment_count = get8(f);
1468 if (!getn(f, f->segments, f->segment_count))
1469 return error(f, VORBIS_unexpected_eof);
1470 // assume we _don't_ know any the sample position of any segments
1471 f->end_seg_with_known_loc = -2;
1472 if (loc0 != ~0U || loc1 != ~0U) {
1473 int i;
1474 // determine which packet is the last one that will complete
1475 for (i=f->segment_count-1; i >= 0; --i)
1476 if (f->segments[i] < 255)
1477 break;
1478 // 'i' is now the index of the _last_ segment of a packet that ends
1479 if (i >= 0) {
1480 f->end_seg_with_known_loc = i;
1481 f->known_loc_for_packet = loc0;
1482 }
1483 }
1484 if (f->first_decode) {
1485 int i,len;
1486 len = 0;
1487 for (i=0; i < f->segment_count; ++i)
1488 len += f->segments[i];
1489 len += 27 + f->segment_count;
1490 f->p_first.page_end = f->p_first.page_start + len;
1491 f->p_first.last_decoded_sample = loc0;
1492 }
1493 f->next_seg = 0;
1494 return TRUE;
1495}
1496
1497static int start_page(vorb *f)
1498{
1499 if (!capture_pattern(f)) return error(f, VORBIS_missing_capture_pattern);
1500 return start_page_no_capturepattern(f);
1501}
1502
1503static int start_packet(vorb *f)
1504{
1505 while (f->next_seg == -1) {
1506 if (!start_page(f)) return FALSE;
1507 if (f->page_flag & PAGEFLAG_continued_packet)
1508 return error(f, VORBIS_continued_packet_flag_invalid);
1509 }
1510 f->last_seg = FALSE;
1511 f->valid_bits = 0;
1512 f->packet_bytes = 0;
1513 f->bytes_in_seg = 0;
1514 // f->next_seg is now valid
1515 return TRUE;
1516}
1517
1518static int maybe_start_packet(vorb *f)
1519{
1520 if (f->next_seg == -1) {
1521 int x = get8(f);
1522 if (f->eof) return FALSE; // EOF at page boundary is not an error!
1523 if (0x4f != x ) return error(f, VORBIS_missing_capture_pattern);
1524 if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1525 if (0x67 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1526 if (0x53 != get8(f)) return error(f, VORBIS_missing_capture_pattern);
1527 if (!start_page_no_capturepattern(f)) return FALSE;
1528 if (f->page_flag & PAGEFLAG_continued_packet) {
1529 // set up enough state that we can read this packet if we want,
1530 // e.g. during recovery
1531 f->last_seg = FALSE;
1532 f->bytes_in_seg = 0;
1533 return error(f, VORBIS_continued_packet_flag_invalid);
1534 }
1535 }
1536 return start_packet(f);
1537}
1538
1539static int next_segment(vorb *f)
1540{
1541 int len;
1542 if (f->last_seg) return 0;
1543 if (f->next_seg == -1) {
1544 f->last_seg_which = f->segment_count-1; // in case start_page fails
1545 if (!start_page(f)) { f->last_seg = 1; return 0; }
1546 if (!(f->page_flag & PAGEFLAG_continued_packet)) return error(f, VORBIS_continued_packet_flag_invalid);
1547 }
1548 len = f->segments[f->next_seg++];
1549 if (len < 255) {
1550 f->last_seg = TRUE;
1551 f->last_seg_which = f->next_seg-1;
1552 }
1553 if (f->next_seg >= f->segment_count)
1554 f->next_seg = -1;
1555 assert(f->bytes_in_seg == 0);
1556 f->bytes_in_seg = len;
1557 return len;
1558}
1559
1560#define EOP (-1)
1561#define INVALID_BITS (-1)
1562
1563static int get8_packet_raw(vorb *f)
1564{
1565 if (!f->bytes_in_seg) { // CLANG!
1566 if (f->last_seg) return EOP;
1567 else if (!next_segment(f)) return EOP;
1568 }
1569 assert(f->bytes_in_seg > 0);
1570 --f->bytes_in_seg;
1571 ++f->packet_bytes;
1572 return get8(f);
1573}
1574
1575static int get8_packet(vorb *f)
1576{
1577 int x = get8_packet_raw(f);
1578 f->valid_bits = 0;
1579 return x;
1580}
1581
1582static int get32_packet(vorb *f)
1583{
1584 uint32 x;
1585 x = get8_packet(f);
1586 x += get8_packet(f) << 8;
1587 x += get8_packet(f) << 16;
1588 x += (uint32) get8_packet(f) << 24;
1589 return x;
1590}
1591
1592static void flush_packet(vorb *f)
1593{
1594 while (get8_packet_raw(f) != EOP);
1595}
1596
1597// @OPTIMIZE: this is the secondary bit decoder, so it's probably not as important
1598// as the huffman decoder?
1599static uint32 get_bits(vorb *f, int n)
1600{
1601 uint32 z;
1602
1603 if (f->valid_bits < 0) return 0;
1604 if (f->valid_bits < n) {
1605 if (n > 24) {
1606 // the accumulator technique below would not work correctly in this case
1607 z = get_bits(f, 24);
1608 z += get_bits(f, n-24) << 24;
1609 return z;
1610 }
1611 if (f->valid_bits == 0) f->acc = 0;
1612 while (f->valid_bits < n) {
1613 int z = get8_packet_raw(f);
1614 if (z == EOP) {
1615 f->valid_bits = INVALID_BITS;
1616 return 0;
1617 }
1618 f->acc += z << f->valid_bits;
1619 f->valid_bits += 8;
1620 }
1621 }
1622
1623 assert(f->valid_bits >= n);
1624 z = f->acc & ((1 << n)-1);
1625 f->acc >>= n;
1626 f->valid_bits -= n;
1627 return z;
1628}
1629
1630// @OPTIMIZE: primary accumulator for huffman
1631// expand the buffer to as many bits as possible without reading off end of packet
1632// it might be nice to allow f->valid_bits and f->acc to be stored in registers,
1633// e.g. cache them locally and decode locally
1634static __forceinline void prep_huffman(vorb *f)
1635{
1636 if (f->valid_bits <= 24) {
1637 if (f->valid_bits == 0) f->acc = 0;
1638 do {
1639 int z;
1640 if (f->last_seg && !f->bytes_in_seg) return;
1641 z = get8_packet_raw(f);
1642 if (z == EOP) return;
1643 f->acc += (unsigned) z << f->valid_bits;
1644 f->valid_bits += 8;
1645 } while (f->valid_bits <= 24);
1646 }
1647}
1648
1649enum
1650{
1651 VORBIS_packet_id = 1,
1652 VORBIS_packet_comment = 3,
1653 VORBIS_packet_setup = 5
1654};
1655
1656static int codebook_decode_scalar_raw(vorb *f, Codebook *c)
1657{
1658 int i;
1659 prep_huffman(f);
1660
1661 if (c->codewords == NULL && c->sorted_codewords == NULL)
1662 return -1;
1663
1664 // cases to use binary search: sorted_codewords && !c->codewords
1665 // sorted_codewords && c->entries > 8
1666 if (c->entries > 8 ? c->sorted_codewords!=NULL : !c->codewords) {
1667 // binary search
1668 uint32 code = bit_reverse(f->acc);
1669 int x=0, n=c->sorted_entries, len;
1670
1671 while (n > 1) {
1672 // invariant: sc[x] <= code < sc[x+n]
1673 int m = x + (n >> 1);
1674 if (c->sorted_codewords[m] <= code) {
1675 x = m;
1676 n -= (n>>1);
1677 } else {
1678 n >>= 1;
1679 }
1680 }
1681 // x is now the sorted index
1682 if (!c->sparse) x = c->sorted_values[x];
1683 // x is now sorted index if sparse, or symbol otherwise
1684 len = c->codeword_lengths[x];
1685 if (f->valid_bits >= len) {
1686 f->acc >>= len;
1687 f->valid_bits -= len;
1688 return x;
1689 }
1690
1691 f->valid_bits = 0;
1692 return -1;
1693 }
1694
1695 // if small, linear search
1696 assert(!c->sparse);
1697 for (i=0; i < c->entries; ++i) {
1698 if (c->codeword_lengths[i] == NO_CODE) continue;
1699 if (c->codewords[i] == (f->acc & ((1 << c->codeword_lengths[i])-1))) {
1700 if (f->valid_bits >= c->codeword_lengths[i]) {
1701 f->acc >>= c->codeword_lengths[i];
1702 f->valid_bits -= c->codeword_lengths[i];
1703 return i;
1704 }
1705 f->valid_bits = 0;
1706 return -1;
1707 }
1708 }
1709
1710 error(f, VORBIS_invalid_stream);
1711 f->valid_bits = 0;
1712 return -1;
1713}
1714
1715#ifndef STB_VORBIS_NO_INLINE_DECODE
1716
1717#define DECODE_RAW(var, f,c) \
1718 if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH) \
1719 prep_huffman(f); \
1720 var = f->acc & FAST_HUFFMAN_TABLE_MASK; \
1721 var = c->fast_huffman[var]; \
1722 if (var >= 0) { \
1723 int n = c->codeword_lengths[var]; \
1724 f->acc >>= n; \
1725 f->valid_bits -= n; \
1726 if (f->valid_bits < 0) { f->valid_bits = 0; var = -1; } \
1727 } else { \
1728 var = codebook_decode_scalar_raw(f,c); \
1729 }
1730
1731#else
1732
1733static int codebook_decode_scalar(vorb *f, Codebook *c)
1734{
1735 int i;
1736 if (f->valid_bits < STB_VORBIS_FAST_HUFFMAN_LENGTH)
1737 prep_huffman(f);
1738 // fast huffman table lookup
1739 i = f->acc & FAST_HUFFMAN_TABLE_MASK;
1740 i = c->fast_huffman[i];
1741 if (i >= 0) {
1742 f->acc >>= c->codeword_lengths[i];
1743 f->valid_bits -= c->codeword_lengths[i];
1744 if (f->valid_bits < 0) { f->valid_bits = 0; return -1; }
1745 return i;
1746 }
1747 return codebook_decode_scalar_raw(f,c);
1748}
1749
1750#define DECODE_RAW(var,f,c) var = codebook_decode_scalar(f,c);
1751
1752#endif
1753
1754#define DECODE(var,f,c) \
1755 DECODE_RAW(var,f,c) \
1756 if (c->sparse) var = c->sorted_values[var];
1757
1758#ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
1759 #define DECODE_VQ(var,f,c) DECODE_RAW(var,f,c)
1760#else
1761 #define DECODE_VQ(var,f,c) DECODE(var,f,c)
1762#endif
1763
1764
1765
1766
1767
1768
1769// CODEBOOK_ELEMENT_FAST is an optimization for the CODEBOOK_FLOATS case
1770// where we avoid one addition
1771#define CODEBOOK_ELEMENT(c,off) (c->multiplicands[off])
1772#define CODEBOOK_ELEMENT_FAST(c,off) (c->multiplicands[off])
1773#define CODEBOOK_ELEMENT_BASE(c) (0)
1774
1775static int codebook_decode_start(vorb *f, Codebook *c)
1776{
1777 int z = -1;
1778
1779 // type 0 is only legal in a scalar context
1780 if (c->lookup_type == 0)
1781 error(f, VORBIS_invalid_stream);
1782 else {
1783 DECODE_VQ(z,f,c);
1784 if (c->sparse) assert(z < c->sorted_entries);
1785 if (z < 0) { // check for EOP
1786 if (!f->bytes_in_seg)
1787 if (f->last_seg)
1788 return z;
1789 error(f, VORBIS_invalid_stream);
1790 }
1791 }
1792 return z;
1793}
1794
1795static int codebook_decode(vorb *f, Codebook *c, float *output, int len)
1796{
1797 int i,z = codebook_decode_start(f,c);
1798 if (z < 0) return FALSE;
1799 if (len > c->dimensions) len = c->dimensions;
1800
1801#ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1802 if (c->lookup_type == 1) {
1803 float last = CODEBOOK_ELEMENT_BASE(c);
1804 int div = 1;
1805 for (i=0; i < len; ++i) {
1806 int off = (z / div) % c->lookup_values;
1807 float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1808 output[i] += val;
1809 if (c->sequence_p) last = val + c->minimum_value;
1810 div *= c->lookup_values;
1811 }
1812 return TRUE;
1813 }
1814#endif
1815
1816 z *= c->dimensions;
1817 if (c->sequence_p) {
1818 float last = CODEBOOK_ELEMENT_BASE(c);
1819 for (i=0; i < len; ++i) {
1820 float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1821 output[i] += val;
1822 last = val + c->minimum_value;
1823 }
1824 } else {
1825 float last = CODEBOOK_ELEMENT_BASE(c);
1826 for (i=0; i < len; ++i) {
1827 output[i] += CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1828 }
1829 }
1830
1831 return TRUE;
1832}
1833
1834static int codebook_decode_step(vorb *f, Codebook *c, float *output, int len, int step)
1835{
1836 int i,z = codebook_decode_start(f,c);
1837 float last = CODEBOOK_ELEMENT_BASE(c);
1838 if (z < 0) return FALSE;
1839 if (len > c->dimensions) len = c->dimensions;
1840
1841#ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1842 if (c->lookup_type == 1) {
1843 int div = 1;
1844 for (i=0; i < len; ++i) {
1845 int off = (z / div) % c->lookup_values;
1846 float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1847 output[i*step] += val;
1848 if (c->sequence_p) last = val;
1849 div *= c->lookup_values;
1850 }
1851 return TRUE;
1852 }
1853#endif
1854
1855 z *= c->dimensions;
1856 for (i=0; i < len; ++i) {
1857 float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1858 output[i*step] += val;
1859 if (c->sequence_p) last = val;
1860 }
1861
1862 return TRUE;
1863}
1864
1865static int codebook_decode_deinterleave_repeat(vorb *f, Codebook *c, float **outputs, int ch, int *c_inter_p, int *p_inter_p, int len, int total_decode)
1866{
1867 int c_inter = *c_inter_p;
1868 int p_inter = *p_inter_p;
1869 int i,z, effective = c->dimensions;
1870
1871 // type 0 is only legal in a scalar context
1872 if (c->lookup_type == 0) return error(f, VORBIS_invalid_stream);
1873
1874 while (total_decode > 0) {
1875 float last = CODEBOOK_ELEMENT_BASE(c);
1876 DECODE_VQ(z,f,c);
1877 #ifndef STB_VORBIS_DIVIDES_IN_CODEBOOK
1878 assert(!c->sparse || z < c->sorted_entries);
1879 #endif
1880 if (z < 0) {
1881 if (!f->bytes_in_seg)
1882 if (f->last_seg) return FALSE;
1883 return error(f, VORBIS_invalid_stream);
1884 }
1885
1886 // if this will take us off the end of the buffers, stop short!
1887 // we check by computing the length of the virtual interleaved
1888 // buffer (len*ch), our current offset within it (p_inter*ch)+(c_inter),
1889 // and the length we'll be using (effective)
1890 if (c_inter + p_inter*ch + effective > len * ch) {
1891 effective = len*ch - (p_inter*ch - c_inter);
1892 }
1893
1894 #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
1895 if (c->lookup_type == 1) {
1896 int div = 1;
1897 for (i=0; i < effective; ++i) {
1898 int off = (z / div) % c->lookup_values;
1899 float val = CODEBOOK_ELEMENT_FAST(c,off) + last;
1900 if (outputs[c_inter])
1901 outputs[c_inter][p_inter] += val;
1902 if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1903 if (c->sequence_p) last = val;
1904 div *= c->lookup_values;
1905 }
1906 } else
1907 #endif
1908 {
1909 z *= c->dimensions;
1910 if (c->sequence_p) {
1911 for (i=0; i < effective; ++i) {
1912 float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1913 if (outputs[c_inter])
1914 outputs[c_inter][p_inter] += val;
1915 if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1916 last = val;
1917 }
1918 } else {
1919 for (i=0; i < effective; ++i) {
1920 float val = CODEBOOK_ELEMENT_FAST(c,z+i) + last;
1921 if (outputs[c_inter])
1922 outputs[c_inter][p_inter] += val;
1923 if (++c_inter == ch) { c_inter = 0; ++p_inter; }
1924 }
1925 }
1926 }
1927
1928 total_decode -= effective;
1929 }
1930 *c_inter_p = c_inter;
1931 *p_inter_p = p_inter;
1932 return TRUE;
1933}
1934
1935static int predict_point(int x, int x0, int x1, int y0, int y1)
1936{
1937 int dy = y1 - y0;
1938 int adx = x1 - x0;
1939 // @OPTIMIZE: force int division to round in the right direction... is this necessary on x86?
1940 int err = abs(dy) * (x - x0);
1941 int off = err / adx;
1942 return dy < 0 ? y0 - off : y0 + off;
1943}
1944
1945// the following table is block-copied from the specification
1946static float inverse_db_table[256] =
1947{
1948 1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f,
1949 1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f,
1950 1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f,
1951 2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f,
1952 2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f,
1953 3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f,
1954 4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f,
1955 6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f,
1956 7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f,
1957 1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f,
1958 1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f,
1959 1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f,
1960 2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f,
1961 2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f,
1962 3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f,
1963 4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f,
1964 5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f,
1965 7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f,
1966 9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f,
1967 1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f,
1968 1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f,
1969 2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f,
1970 2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f,
1971 3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f,
1972 4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f,
1973 5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f,
1974 7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f,
1975 9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f,
1976 0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f,
1977 0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f,
1978 0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f,
1979 0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f,
1980 0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f,
1981 0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f,
1982 0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f,
1983 0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f,
1984 0.00092223983f, 0.00098217216f, 0.0010459992f, 0.0011139742f,
1985 0.0011863665f, 0.0012634633f, 0.0013455702f, 0.0014330129f,
1986 0.0015261382f, 0.0016253153f, 0.0017309374f, 0.0018434235f,
1987 0.0019632195f, 0.0020908006f, 0.0022266726f, 0.0023713743f,
1988 0.0025254795f, 0.0026895994f, 0.0028643847f, 0.0030505286f,
1989 0.0032487691f, 0.0034598925f, 0.0036847358f, 0.0039241906f,
1990 0.0041792066f, 0.0044507950f, 0.0047400328f, 0.0050480668f,
1991 0.0053761186f, 0.0057254891f, 0.0060975636f, 0.0064938176f,
1992 0.0069158225f, 0.0073652516f, 0.0078438871f, 0.0083536271f,
1993 0.0088964928f, 0.009474637f, 0.010090352f, 0.010746080f,
1994 0.011444421f, 0.012188144f, 0.012980198f, 0.013823725f,
1995 0.014722068f, 0.015678791f, 0.016697687f, 0.017782797f,
1996 0.018938423f, 0.020169149f, 0.021479854f, 0.022875735f,
1997 0.024362330f, 0.025945531f, 0.027631618f, 0.029427276f,
1998 0.031339626f, 0.033376252f, 0.035545228f, 0.037855157f,
1999 0.040315199f, 0.042935108f, 0.045725273f, 0.048696758f,
2000 0.051861348f, 0.055231591f, 0.058820850f, 0.062643361f,
2001 0.066714279f, 0.071049749f, 0.075666962f, 0.080584227f,
2002 0.085821044f, 0.091398179f, 0.097337747f, 0.10366330f,
2003 0.11039993f, 0.11757434f, 0.12521498f, 0.13335215f,
2004 0.14201813f, 0.15124727f, 0.16107617f, 0.17154380f,
2005 0.18269168f, 0.19456402f, 0.20720788f, 0.22067342f,
2006 0.23501402f, 0.25028656f, 0.26655159f, 0.28387361f,
2007 0.30232132f, 0.32196786f, 0.34289114f, 0.36517414f,
2008 0.38890521f, 0.41417847f, 0.44109412f, 0.46975890f,
2009 0.50028648f, 0.53279791f, 0.56742212f, 0.60429640f,
2010 0.64356699f, 0.68538959f, 0.72993007f, 0.77736504f,
2011 0.82788260f, 0.88168307f, 0.9389798f, 1.0f
2012};
2013
2014
2015// @OPTIMIZE: if you want to replace this bresenham line-drawing routine,
2016// note that you must produce bit-identical output to decode correctly;
2017// this specific sequence of operations is specified in the spec (it's
2018// drawing integer-quantized frequency-space lines that the encoder
2019// expects to be exactly the same)
2020// ... also, isn't the whole point of Bresenham's algorithm to NOT
2021// have to divide in the setup? sigh.
2022#ifndef STB_VORBIS_NO_DEFER_FLOOR
2023#define LINE_OP(a,b) a *= b
2024#else
2025#define LINE_OP(a,b) a = b
2026#endif
2027
2028#ifdef STB_VORBIS_DIVIDE_TABLE
2029#define DIVTAB_NUMER 32
2030#define DIVTAB_DENOM 64
2031int8 integer_divide_table[DIVTAB_NUMER][DIVTAB_DENOM]; // 2KB
2032#endif
2033
2034static __forceinline void draw_line(float *output, int x0, int y0, int x1, int y1, int n)
2035{
2036 int dy = y1 - y0;
2037 int adx = x1 - x0;
2038 int ady = abs(dy);
2039 int base;
2040 int x=x0,y=y0;
2041 int err = 0;
2042 int sy;
2043
2044#ifdef STB_VORBIS_DIVIDE_TABLE
2045 if (adx < DIVTAB_DENOM && ady < DIVTAB_NUMER) {
2046 if (dy < 0) {
2047 base = -integer_divide_table[ady][adx];
2048 sy = base-1;
2049 } else {
2050 base = integer_divide_table[ady][adx];
2051 sy = base+1;
2052 }
2053 } else {
2054 base = dy / adx;
2055 if (dy < 0)
2056 sy = base - 1;
2057 else
2058 sy = base+1;
2059 }
2060#else
2061 base = dy / adx;
2062 if (dy < 0)
2063 sy = base - 1;
2064 else
2065 sy = base+1;
2066#endif
2067 ady -= abs(base) * adx;
2068 if (x1 > n) x1 = n;
2069 if (x < x1) {
2070 LINE_OP(output[x], inverse_db_table[y&255]);
2071 for (++x; x < x1; ++x) {
2072 err += ady;
2073 if (err >= adx) {
2074 err -= adx;
2075 y += sy;
2076 } else
2077 y += base;
2078 LINE_OP(output[x], inverse_db_table[y&255]);
2079 }
2080 }
2081}
2082
2083static int residue_decode(vorb *f, Codebook *book, float *target, int offset, int n, int rtype)
2084{
2085 int k;
2086 if (rtype == 0) {
2087 int step = n / book->dimensions;
2088 for (k=0; k < step; ++k)
2089 if (!codebook_decode_step(f, book, target+offset+k, n-offset-k, step))
2090 return FALSE;
2091 } else {
2092 for (k=0; k < n; ) {
2093 if (!codebook_decode(f, book, target+offset, n-k))
2094 return FALSE;
2095 k += book->dimensions;
2096 offset += book->dimensions;
2097 }
2098 }
2099 return TRUE;
2100}
2101
2102// n is 1/2 of the blocksize --
2103// specification: "Correct per-vector decode length is [n]/2"
2104static void decode_residue(vorb *f, float *residue_buffers[], int ch, int n, int rn, uint8 *do_not_decode)
2105{
2106 int i,j,pass;
2107 Residue *r = f->residue_config + rn;
2108 int rtype = f->residue_types[rn];
2109 int c = r->classbook;
2110 int classwords = f->codebooks[c].dimensions;
2111 unsigned int actual_size = rtype == 2 ? n*2 : n;
2112 unsigned int limit_r_begin = (r->begin < actual_size ? r->begin : actual_size);
2113 unsigned int limit_r_end = (r->end < actual_size ? r->end : actual_size);
2114 int n_read = limit_r_end - limit_r_begin;
2115 int part_read = n_read / r->part_size;
2116 int temp_alloc_point = temp_alloc_save(f);
2117 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2118 uint8 ***part_classdata = (uint8 ***) temp_block_array(f,f->channels, part_read * sizeof(**part_classdata));
2119 #else
2120 int **classifications = (int **) temp_block_array(f,f->channels, part_read * sizeof(**classifications));
2121 #endif
2122
2123 CHECK(f);
2124
2125 for (i=0; i < ch; ++i)
2126 if (!do_not_decode[i])
2127 memset(residue_buffers[i], 0, sizeof(float) * n);
2128
2129 if (rtype == 2 && ch != 1) {
2130 for (j=0; j < ch; ++j)
2131 if (!do_not_decode[j])
2132 break;
2133 if (j == ch)
2134 goto done;
2135
2136 for (pass=0; pass < 8; ++pass) {
2137 int pcount = 0, class_set = 0;
2138 if (ch == 2) {
2139 while (pcount < part_read) {
2140 int z = r->begin + pcount*r->part_size;
2141 int c_inter = (z & 1), p_inter = z>>1;
2142 if (pass == 0) {
2143 Codebook *c = f->codebooks+r->classbook;
2144 int q;
2145 DECODE(q,f,c);
2146 if (q == EOP) goto done;
2147 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2148 part_classdata[0][class_set] = r->classdata[q];
2149 #else
2150 for (i=classwords-1; i >= 0; --i) {
2151 classifications[0][i+pcount] = q % r->classifications;
2152 q /= r->classifications;
2153 }
2154 #endif
2155 }
2156 for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2157 int z = r->begin + pcount*r->part_size;
2158 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2159 int c = part_classdata[0][class_set][i];
2160 #else
2161 int c = classifications[0][pcount];
2162 #endif
2163 int b = r->residue_books[c][pass];
2164 if (b >= 0) {
2165 Codebook *book = f->codebooks + b;
2166 #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
2167 if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2168 goto done;
2169 #else
2170 // saves 1%
2171 if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2172 goto done;
2173 #endif
2174 } else {
2175 z += r->part_size;
2176 c_inter = z & 1;
2177 p_inter = z >> 1;
2178 }
2179 }
2180 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2181 ++class_set;
2182 #endif
2183 }
2184 } else if (ch > 2) {
2185 while (pcount < part_read) {
2186 int z = r->begin + pcount*r->part_size;
2187 int c_inter = z % ch, p_inter = z/ch;
2188 if (pass == 0) {
2189 Codebook *c = f->codebooks+r->classbook;
2190 int q;
2191 DECODE(q,f,c);
2192 if (q == EOP) goto done;
2193 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2194 part_classdata[0][class_set] = r->classdata[q];
2195 #else
2196 for (i=classwords-1; i >= 0; --i) {
2197 classifications[0][i+pcount] = q % r->classifications;
2198 q /= r->classifications;
2199 }
2200 #endif
2201 }
2202 for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2203 int z = r->begin + pcount*r->part_size;
2204 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2205 int c = part_classdata[0][class_set][i];
2206 #else
2207 int c = classifications[0][pcount];
2208 #endif
2209 int b = r->residue_books[c][pass];
2210 if (b >= 0) {
2211 Codebook *book = f->codebooks + b;
2212 if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers, ch, &c_inter, &p_inter, n, r->part_size))
2213 goto done;
2214 } else {
2215 z += r->part_size;
2216 c_inter = z % ch;
2217 p_inter = z / ch;
2218 }
2219 }
2220 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2221 ++class_set;
2222 #endif
2223 }
2224 }
2225 }
2226 goto done;
2227 }
2228 CHECK(f);
2229
2230 for (pass=0; pass < 8; ++pass) {
2231 int pcount = 0, class_set=0;
2232 while (pcount < part_read) {
2233 if (pass == 0) {
2234 for (j=0; j < ch; ++j) {
2235 if (!do_not_decode[j]) {
2236 Codebook *c = f->codebooks+r->classbook;
2237 int temp;
2238 DECODE(temp,f,c);
2239 if (temp == EOP) goto done;
2240 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2241 part_classdata[j][class_set] = r->classdata[temp];
2242 #else
2243 for (i=classwords-1; i >= 0; --i) {
2244 classifications[j][i+pcount] = temp % r->classifications;
2245 temp /= r->classifications;
2246 }
2247 #endif
2248 }
2249 }
2250 }
2251 for (i=0; i < classwords && pcount < part_read; ++i, ++pcount) {
2252 for (j=0; j < ch; ++j) {
2253 if (!do_not_decode[j]) {
2254 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2255 int c = part_classdata[j][class_set][i];
2256 #else
2257 int c = classifications[j][pcount];
2258 #endif
2259 int b = r->residue_books[c][pass];
2260 if (b >= 0) {
2261 float *target = residue_buffers[j];
2262 int offset = r->begin + pcount * r->part_size;
2263 int n = r->part_size;
2264 Codebook *book = f->codebooks + b;
2265 if (!residue_decode(f, book, target, offset, n, rtype))
2266 goto done;
2267 }
2268 }
2269 }
2270 }
2271 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2272 ++class_set;
2273 #endif
2274 }
2275 }
2276 done:
2277 CHECK(f);
2278 #ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
2279 temp_free(f,part_classdata);
2280 #else
2281 temp_free(f,classifications);
2282 #endif
2283 temp_alloc_restore(f,temp_alloc_point);
2284}
2285
2286
2287#if 0
2288// slow way for debugging
2289void inverse_mdct_slow(float *buffer, int n)
2290{
2291 int i,j;
2292 int n2 = n >> 1;
2293 float *x = (float *) malloc(sizeof(*x) * n2);
2294 memcpy(x, buffer, sizeof(*x) * n2);
2295 for (i=0; i < n; ++i) {
2296 float acc = 0;
2297 for (j=0; j < n2; ++j)
2298 // formula from paper:
2299 //acc += n/4.0f * x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
2300 // formula from wikipedia
2301 //acc += 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
2302 // these are equivalent, except the formula from the paper inverts the multiplier!
2303 // however, what actually works is NO MULTIPLIER!?!
2304 //acc += 64 * 2.0f / n2 * x[j] * (float) cos(M_PI/n2 * (i + 0.5 + n2/2)*(j + 0.5));
2305 acc += x[j] * (float) cos(M_PI / 2 / n * (2 * i + 1 + n/2.0)*(2*j+1));
2306 buffer[i] = acc;
2307 }
2308 free(x);
2309}
2310#elif 0
2311// same as above, but just barely able to run in real time on modern machines
2312void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
2313{
2314 float mcos[16384];
2315 int i,j;
2316 int n2 = n >> 1, nmask = (n << 2) -1;
2317 float *x = (float *) malloc(sizeof(*x) * n2);
2318 memcpy(x, buffer, sizeof(*x) * n2);
2319 for (i=0; i < 4*n; ++i)
2320 mcos[i] = (float) cos(M_PI / 2 * i / n);
2321
2322 for (i=0; i < n; ++i) {
2323 float acc = 0;
2324 for (j=0; j < n2; ++j)
2325 acc += x[j] * mcos[(2 * i + 1 + n2)*(2*j+1) & nmask];
2326 buffer[i] = acc;
2327 }
2328 free(x);
2329}
2330#elif 0
2331// transform to use a slow dct-iv; this is STILL basically trivial,
2332// but only requires half as many ops
2333void dct_iv_slow(float *buffer, int n)
2334{
2335 float mcos[16384];
2336 float x[2048];
2337 int i,j;
2338 int n2 = n >> 1, nmask = (n << 3) - 1;
2339 memcpy(x, buffer, sizeof(*x) * n);
2340 for (i=0; i < 8*n; ++i)
2341 mcos[i] = (float) cos(M_PI / 4 * i / n);
2342 for (i=0; i < n; ++i) {
2343 float acc = 0;
2344 for (j=0; j < n; ++j)
2345 acc += x[j] * mcos[((2 * i + 1)*(2*j+1)) & nmask];
2346 buffer[i] = acc;
2347 }
2348}
2349
2350void inverse_mdct_slow(float *buffer, int n, vorb *f, int blocktype)
2351{
2352 int i, n4 = n >> 2, n2 = n >> 1, n3_4 = n - n4;
2353 float temp[4096];
2354
2355 memcpy(temp, buffer, n2 * sizeof(float));
2356 dct_iv_slow(temp, n2); // returns -c'-d, a-b'
2357
2358 for (i=0; i < n4 ; ++i) buffer[i] = temp[i+n4]; // a-b'
2359 for ( ; i < n3_4; ++i) buffer[i] = -temp[n3_4 - i - 1]; // b-a', c+d'
2360 for ( ; i < n ; ++i) buffer[i] = -temp[i - n3_4]; // c'+d
2361}
2362#endif
2363
2364#ifndef LIBVORBIS_MDCT
2365#define LIBVORBIS_MDCT 0
2366#endif
2367
2368#if LIBVORBIS_MDCT
2369// directly call the vorbis MDCT using an interface documented
2370// by Jeff Roberts... useful for performance comparison
2371typedef struct
2372{
2373 int n;
2374 int log2n;
2375
2376 float *trig;
2377 int *bitrev;
2378
2379 float scale;
2380} mdct_lookup;
2381
2382extern void mdct_init(mdct_lookup *lookup, int n);
2383extern void mdct_clear(mdct_lookup *l);
2384extern void mdct_backward(mdct_lookup *init, float *in, float *out);
2385
2386mdct_lookup M1,M2;
2387
2388void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
2389{
2390 mdct_lookup *M;
2391 if (M1.n == n) M = &M1;
2392 else if (M2.n == n) M = &M2;
2393 else if (M1.n == 0) { mdct_init(&M1, n); M = &M1; }
2394 else {
2395 if (M2.n) __asm int 3;
2396 mdct_init(&M2, n);
2397 M = &M2;
2398 }
2399
2400 mdct_backward(M, buffer, buffer);
2401}
2402#endif
2403
2404
2405// the following were split out into separate functions while optimizing;
2406// they could be pushed back up but eh. __forceinline showed no change;
2407// they're probably already being inlined.
2408static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A)
2409{
2410 float *ee0 = e + i_off;
2411 float *ee2 = ee0 + k_off;
2412 int i;
2413
2414 assert((n & 3) == 0);
2415 for (i=(n>>2); i > 0; --i) {
2416 float k00_20, k01_21;
2417 k00_20 = ee0[ 0] - ee2[ 0];
2418 k01_21 = ee0[-1] - ee2[-1];
2419 ee0[ 0] += ee2[ 0];//ee0[ 0] = ee0[ 0] + ee2[ 0];
2420 ee0[-1] += ee2[-1];//ee0[-1] = ee0[-1] + ee2[-1];
2421 ee2[ 0] = k00_20 * A[0] - k01_21 * A[1];
2422 ee2[-1] = k01_21 * A[0] + k00_20 * A[1];
2423 A += 8;
2424
2425 k00_20 = ee0[-2] - ee2[-2];
2426 k01_21 = ee0[-3] - ee2[-3];
2427 ee0[-2] += ee2[-2];//ee0[-2] = ee0[-2] + ee2[-2];
2428 ee0[-3] += ee2[-3];//ee0[-3] = ee0[-3] + ee2[-3];
2429 ee2[-2] = k00_20 * A[0] - k01_21 * A[1];
2430 ee2[-3] = k01_21 * A[0] + k00_20 * A[1];
2431 A += 8;
2432
2433 k00_20 = ee0[-4] - ee2[-4];
2434 k01_21 = ee0[-5] - ee2[-5];
2435 ee0[-4] += ee2[-4];//ee0[-4] = ee0[-4] + ee2[-4];
2436 ee0[-5] += ee2[-5];//ee0[-5] = ee0[-5] + ee2[-5];
2437 ee2[-4] = k00_20 * A[0] - k01_21 * A[1];
2438 ee2[-5] = k01_21 * A[0] + k00_20 * A[1];
2439 A += 8;
2440
2441 k00_20 = ee0[-6] - ee2[-6];
2442 k01_21 = ee0[-7] - ee2[-7];
2443 ee0[-6] += ee2[-6];//ee0[-6] = ee0[-6] + ee2[-6];
2444 ee0[-7] += ee2[-7];//ee0[-7] = ee0[-7] + ee2[-7];
2445 ee2[-6] = k00_20 * A[0] - k01_21 * A[1];
2446 ee2[-7] = k01_21 * A[0] + k00_20 * A[1];
2447 A += 8;
2448 ee0 -= 8;
2449 ee2 -= 8;
2450 }
2451}
2452
2453static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1)
2454{
2455 int i;
2456 float k00_20, k01_21;
2457
2458 float *e0 = e + d0;
2459 float *e2 = e0 + k_off;
2460
2461 for (i=lim >> 2; i > 0; --i) {
2462 k00_20 = e0[-0] - e2[-0];
2463 k01_21 = e0[-1] - e2[-1];
2464 e0[-0] += e2[-0];//e0[-0] = e0[-0] + e2[-0];
2465 e0[-1] += e2[-1];//e0[-1] = e0[-1] + e2[-1];
2466 e2[-0] = (k00_20)*A[0] - (k01_21) * A[1];
2467 e2[-1] = (k01_21)*A[0] + (k00_20) * A[1];
2468
2469 A += k1;
2470
2471 k00_20 = e0[-2] - e2[-2];
2472 k01_21 = e0[-3] - e2[-3];
2473 e0[-2] += e2[-2];//e0[-2] = e0[-2] + e2[-2];
2474 e0[-3] += e2[-3];//e0[-3] = e0[-3] + e2[-3];
2475 e2[-2] = (k00_20)*A[0] - (k01_21) * A[1];
2476 e2[-3] = (k01_21)*A[0] + (k00_20) * A[1];
2477
2478 A += k1;
2479
2480 k00_20 = e0[-4] - e2[-4];
2481 k01_21 = e0[-5] - e2[-5];
2482 e0[-4] += e2[-4];//e0[-4] = e0[-4] + e2[-4];
2483 e0[-5] += e2[-5];//e0[-5] = e0[-5] + e2[-5];
2484 e2[-4] = (k00_20)*A[0] - (k01_21) * A[1];
2485 e2[-5] = (k01_21)*A[0] + (k00_20) * A[1];
2486
2487 A += k1;
2488
2489 k00_20 = e0[-6] - e2[-6];
2490 k01_21 = e0[-7] - e2[-7];
2491 e0[-6] += e2[-6];//e0[-6] = e0[-6] + e2[-6];
2492 e0[-7] += e2[-7];//e0[-7] = e0[-7] + e2[-7];
2493 e2[-6] = (k00_20)*A[0] - (k01_21) * A[1];
2494 e2[-7] = (k01_21)*A[0] + (k00_20) * A[1];
2495
2496 e0 -= 8;
2497 e2 -= 8;
2498
2499 A += k1;
2500 }
2501}
2502
2503static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0)
2504{
2505 int i;
2506 float A0 = A[0];
2507 float A1 = A[0+1];
2508 float A2 = A[0+a_off];
2509 float A3 = A[0+a_off+1];
2510 float A4 = A[0+a_off*2+0];
2511 float A5 = A[0+a_off*2+1];
2512 float A6 = A[0+a_off*3+0];
2513 float A7 = A[0+a_off*3+1];
2514
2515 float k00,k11;
2516
2517 float *ee0 = e +i_off;
2518 float *ee2 = ee0+k_off;
2519
2520 for (i=n; i > 0; --i) {
2521 k00 = ee0[ 0] - ee2[ 0];
2522 k11 = ee0[-1] - ee2[-1];
2523 ee0[ 0] = ee0[ 0] + ee2[ 0];
2524 ee0[-1] = ee0[-1] + ee2[-1];
2525 ee2[ 0] = (k00) * A0 - (k11) * A1;
2526 ee2[-1] = (k11) * A0 + (k00) * A1;
2527
2528 k00 = ee0[-2] - ee2[-2];
2529 k11 = ee0[-3] - ee2[-3];
2530 ee0[-2] = ee0[-2] + ee2[-2];
2531 ee0[-3] = ee0[-3] + ee2[-3];
2532 ee2[-2] = (k00) * A2 - (k11) * A3;
2533 ee2[-3] = (k11) * A2 + (k00) * A3;
2534
2535 k00 = ee0[-4] - ee2[-4];
2536 k11 = ee0[-5] - ee2[-5];
2537 ee0[-4] = ee0[-4] + ee2[-4];
2538 ee0[-5] = ee0[-5] + ee2[-5];
2539 ee2[-4] = (k00) * A4 - (k11) * A5;
2540 ee2[-5] = (k11) * A4 + (k00) * A5;
2541
2542 k00 = ee0[-6] - ee2[-6];
2543 k11 = ee0[-7] - ee2[-7];
2544 ee0[-6] = ee0[-6] + ee2[-6];
2545 ee0[-7] = ee0[-7] + ee2[-7];
2546 ee2[-6] = (k00) * A6 - (k11) * A7;
2547 ee2[-7] = (k11) * A6 + (k00) * A7;
2548
2549 ee0 -= k0;
2550 ee2 -= k0;
2551 }
2552}
2553
2554static __forceinline void iter_54(float *z)
2555{
2556 float k00,k11,k22,k33;
2557 float y0,y1,y2,y3;
2558
2559 k00 = z[ 0] - z[-4];
2560 y0 = z[ 0] + z[-4];
2561 y2 = z[-2] + z[-6];
2562 k22 = z[-2] - z[-6];
2563
2564 z[-0] = y0 + y2; // z0 + z4 + z2 + z6
2565 z[-2] = y0 - y2; // z0 + z4 - z2 - z6
2566
2567 // done with y0,y2
2568
2569 k33 = z[-3] - z[-7];
2570
2571 z[-4] = k00 + k33; // z0 - z4 + z3 - z7
2572 z[-6] = k00 - k33; // z0 - z4 - z3 + z7
2573
2574 // done with k33
2575
2576 k11 = z[-1] - z[-5];
2577 y1 = z[-1] + z[-5];
2578 y3 = z[-3] + z[-7];
2579
2580 z[-1] = y1 + y3; // z1 + z5 + z3 + z7
2581 z[-3] = y1 - y3; // z1 + z5 - z3 - z7
2582 z[-5] = k11 - k22; // z1 - z5 + z2 - z6
2583 z[-7] = k11 + k22; // z1 - z5 - z2 + z6
2584}
2585
2586static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A, int base_n)
2587{
2588 int a_off = base_n >> 3;
2589 float A2 = A[0+a_off];
2590 float *z = e + i_off;
2591 float *base = z - 16 * n;
2592
2593 while (z > base) {
2594 float k00,k11;
2595 float l00,l11;
2596
2597 k00 = z[-0] - z[ -8];
2598 k11 = z[-1] - z[ -9];
2599 l00 = z[-2] - z[-10];
2600 l11 = z[-3] - z[-11];
2601 z[ -0] = z[-0] + z[ -8];
2602 z[ -1] = z[-1] + z[ -9];
2603 z[ -2] = z[-2] + z[-10];
2604 z[ -3] = z[-3] + z[-11];
2605 z[ -8] = k00;
2606 z[ -9] = k11;
2607 z[-10] = (l00+l11) * A2;
2608 z[-11] = (l11-l00) * A2;
2609
2610 k00 = z[ -4] - z[-12];
2611 k11 = z[ -5] - z[-13];
2612 l00 = z[ -6] - z[-14];
2613 l11 = z[ -7] - z[-15];
2614 z[ -4] = z[ -4] + z[-12];
2615 z[ -5] = z[ -5] + z[-13];
2616 z[ -6] = z[ -6] + z[-14];
2617 z[ -7] = z[ -7] + z[-15];
2618 z[-12] = k11;
2619 z[-13] = -k00;
2620 z[-14] = (l11-l00) * A2;
2621 z[-15] = (l00+l11) * -A2;
2622
2623 iter_54(z);
2624 iter_54(z-8);
2625 z -= 16;
2626 }
2627}
2628
2629static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
2630{
2631 int n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
2632 int ld;
2633 // @OPTIMIZE: reduce register pressure by using fewer variables?
2634 int save_point = temp_alloc_save(f);
2635 float *buf2 = (float *) temp_alloc(f, n2 * sizeof(*buf2));
2636 float *u=NULL,*v=NULL;
2637 // twiddle factors
2638 float *A = f->A[blocktype];
2639
2640 // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
2641 // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' after this function.
2642
2643 // kernel from paper
2644
2645
2646 // merged:
2647 // copy and reflect spectral data
2648 // step 0
2649
2650 // note that it turns out that the items added together during
2651 // this step are, in fact, being added to themselves (as reflected
2652 // by step 0). inexplicable inefficiency! this became obvious
2653 // once I combined the passes.
2654
2655 // so there's a missing 'times 2' here (for adding X to itself).
2656 // this propagates through linearly to the end, where the numbers
2657 // are 1/2 too small, and need to be compensated for.
2658
2659 {
2660 float *d,*e, *AA, *e_stop;
2661 d = &buf2[n2-2];
2662 AA = A;
2663 e = &buffer[0];
2664 e_stop = &buffer[n2];
2665 while (e != e_stop) {
2666 d[1] = (e[0] * AA[0] - e[2]*AA[1]);
2667 d[0] = (e[0] * AA[1] + e[2]*AA[0]);
2668 d -= 2;
2669 AA += 2;
2670 e += 4;
2671 }
2672
2673 e = &buffer[n2-3];
2674 while (d >= buf2) {
2675 d[1] = (-e[2] * AA[0] - -e[0]*AA[1]);
2676 d[0] = (-e[2] * AA[1] + -e[0]*AA[0]);
2677 d -= 2;
2678 AA += 2;
2679 e -= 4;
2680 }
2681 }
2682
2683 // now we use symbolic names for these, so that we can
2684 // possibly swap their meaning as we change which operations
2685 // are in place
2686
2687 u = buffer;
2688 v = buf2;
2689
2690 // step 2 (paper output is w, now u)
2691 // this could be in place, but the data ends up in the wrong
2692 // place... _somebody_'s got to swap it, so this is nominated
2693 {
2694 float *AA = &A[n2-8];
2695 float *d0,*d1, *e0, *e1;
2696
2697 e0 = &v[n4];
2698 e1 = &v[0];
2699
2700 d0 = &u[n4];
2701 d1 = &u[0];
2702
2703 while (AA >= A) {
2704 float v40_20, v41_21;
2705
2706 v41_21 = e0[1] - e1[1];
2707 v40_20 = e0[0] - e1[0];
2708 d0[1] = e0[1] + e1[1];
2709 d0[0] = e0[0] + e1[0];
2710 d1[1] = v41_21*AA[4] - v40_20*AA[5];
2711 d1[0] = v40_20*AA[4] + v41_21*AA[5];
2712
2713 v41_21 = e0[3] - e1[3];
2714 v40_20 = e0[2] - e1[2];
2715 d0[3] = e0[3] + e1[3];
2716 d0[2] = e0[2] + e1[2];
2717 d1[3] = v41_21*AA[0] - v40_20*AA[1];
2718 d1[2] = v40_20*AA[0] + v41_21*AA[1];
2719
2720 AA -= 8;
2721
2722 d0 += 4;
2723 d1 += 4;
2724 e0 += 4;
2725 e1 += 4;
2726 }
2727 }
2728
2729 // step 3
2730 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
2731
2732 // optimized step 3:
2733
2734 // the original step3 loop can be nested r inside s or s inside r;
2735 // it's written originally as s inside r, but this is dumb when r
2736 // iterates many times, and s few. So I have two copies of it and
2737 // switch between them halfway.
2738
2739 // this is iteration 0 of step 3
2740 imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A);
2741 imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A);
2742
2743 // this is iteration 1 of step 3
2744 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16);
2745 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16);
2746 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16);
2747 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16);
2748
2749 l=2;
2750 for (; l < (ld-3)>>1; ++l) {
2751 int k0 = n >> (l+2), k0_2 = k0>>1;
2752 int lim = 1 << (l+1);
2753 int i;
2754 for (i=0; i < lim; ++i)
2755 imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
2756 }
2757
2758 for (; l < ld-6; ++l) {
2759 int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
2760 int rlim = n >> (l+6), r;
2761 int lim = 1 << (l+1);
2762 int i_off;
2763 float *A0 = A;
2764 i_off = n2-1;
2765 for (r=rlim; r > 0; --r) {
2766 imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, A0, k1, k0);
2767 A0 += k1*4;
2768 i_off -= 8;
2769 }
2770 }
2771
2772 // iterations with count:
2773 // ld-6,-5,-4 all interleaved together
2774 // the big win comes from getting rid of needless flops
2775 // due to the constants on pass 5 & 4 being all 1 and 0;
2776 // combining them to be simultaneous to improve cache made little difference
2777 imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n);
2778
2779 // output is u
2780
2781 // step 4, 5, and 6
2782 // cannot be in-place because of step 5
2783 {
2784 uint16 *bitrev = f->bit_reverse[blocktype];
2785 // weirdly, I'd have thought reading sequentially and writing
2786 // erratically would have been better than vice-versa, but in
2787 // fact that's not what my testing showed. (That is, with
2788 // j = bitreverse(i), do you read i and write j, or read j and write i.)
2789
2790 float *d0 = &v[n4-4];
2791 float *d1 = &v[n2-4];
2792 while (d0 >= v) {
2793 int k4;
2794
2795 k4 = bitrev[0];
2796 d1[3] = u[k4+0];
2797 d1[2] = u[k4+1];
2798 d0[3] = u[k4+2];
2799 d0[2] = u[k4+3];
2800
2801 k4 = bitrev[1];
2802 d1[1] = u[k4+0];
2803 d1[0] = u[k4+1];
2804 d0[1] = u[k4+2];
2805 d0[0] = u[k4+3];
2806
2807 d0 -= 4;
2808 d1 -= 4;
2809 bitrev += 2;
2810 }
2811 }
2812 // (paper output is u, now v)
2813
2814
2815 // data must be in buf2
2816 assert(v == buf2);
2817
2818 // step 7 (paper output is v, now v)
2819 // this is now in place
2820 {
2821 float *C = f->C[blocktype];
2822 float *d, *e;
2823
2824 d = v;
2825 e = v + n2 - 4;
2826
2827 while (d < e) {
2828 float a02,a11,b0,b1,b2,b3;
2829
2830 a02 = d[0] - e[2];
2831 a11 = d[1] + e[3];
2832
2833 b0 = C[1]*a02 + C[0]*a11;
2834 b1 = C[1]*a11 - C[0]*a02;
2835
2836 b2 = d[0] + e[ 2];
2837 b3 = d[1] - e[ 3];
2838
2839 d[0] = b2 + b0;
2840 d[1] = b3 + b1;
2841 e[2] = b2 - b0;
2842 e[3] = b1 - b3;
2843
2844 a02 = d[2] - e[0];
2845 a11 = d[3] + e[1];
2846
2847 b0 = C[3]*a02 + C[2]*a11;
2848 b1 = C[3]*a11 - C[2]*a02;
2849
2850 b2 = d[2] + e[ 0];
2851 b3 = d[3] - e[ 1];
2852
2853 d[2] = b2 + b0;
2854 d[3] = b3 + b1;
2855 e[0] = b2 - b0;
2856 e[1] = b1 - b3;
2857
2858 C += 4;
2859 d += 4;
2860 e -= 4;
2861 }
2862 }
2863
2864 // data must be in buf2
2865
2866
2867 // step 8+decode (paper output is X, now buffer)
2868 // this generates pairs of data a la 8 and pushes them directly through
2869 // the decode kernel (pushing rather than pulling) to avoid having
2870 // to make another pass later
2871
2872 // this cannot POSSIBLY be in place, so we refer to the buffers directly
2873
2874 {
2875 float *d0,*d1,*d2,*d3;
2876
2877 float *B = f->B[blocktype] + n2 - 8;
2878 float *e = buf2 + n2 - 8;
2879 d0 = &buffer[0];
2880 d1 = &buffer[n2-4];
2881 d2 = &buffer[n2];
2882 d3 = &buffer[n-4];
2883 while (e >= v) {
2884 float p0,p1,p2,p3;
2885
2886 p3 = e[6]*B[7] - e[7]*B[6];
2887 p2 = -e[6]*B[6] - e[7]*B[7];
2888
2889 d0[0] = p3;
2890 d1[3] = - p3;
2891 d2[0] = p2;
2892 d3[3] = p2;
2893
2894 p1 = e[4]*B[5] - e[5]*B[4];
2895 p0 = -e[4]*B[4] - e[5]*B[5];
2896
2897 d0[1] = p1;
2898 d1[2] = - p1;
2899 d2[1] = p0;
2900 d3[2] = p0;
2901
2902 p3 = e[2]*B[3] - e[3]*B[2];
2903 p2 = -e[2]*B[2] - e[3]*B[3];
2904
2905 d0[2] = p3;
2906 d1[1] = - p3;
2907 d2[2] = p2;
2908 d3[1] = p2;
2909
2910 p1 = e[0]*B[1] - e[1]*B[0];
2911 p0 = -e[0]*B[0] - e[1]*B[1];
2912
2913 d0[3] = p1;
2914 d1[0] = - p1;
2915 d2[3] = p0;
2916 d3[0] = p0;
2917
2918 B -= 8;
2919 e -= 8;
2920 d0 += 4;
2921 d2 += 4;
2922 d1 -= 4;
2923 d3 -= 4;
2924 }
2925 }
2926
2927 temp_free(f,buf2);
2928 temp_alloc_restore(f,save_point);
2929}
2930
2931#if 0
2932// this is the original version of the above code, if you want to optimize it from scratch
2933void inverse_mdct_naive(float *buffer, int n)
2934{
2935 float s;
2936 float A[1 << 12], B[1 << 12], C[1 << 11];
2937 int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
2938 int n3_4 = n - n4, ld;
2939 // how can they claim this only uses N words?!
2940 // oh, because they're only used sparsely, whoops
2941 float u[1 << 13], X[1 << 13], v[1 << 13], w[1 << 13];
2942 // set up twiddle factors
2943
2944 for (k=k2=0; k < n4; ++k,k2+=2) {
2945 A[k2 ] = (float) cos(4*k*M_PI/n);
2946 A[k2+1] = (float) -sin(4*k*M_PI/n);
2947 B[k2 ] = (float) cos((k2+1)*M_PI/n/2);
2948 B[k2+1] = (float) sin((k2+1)*M_PI/n/2);
2949 }
2950 for (k=k2=0; k < n8; ++k,k2+=2) {
2951 C[k2 ] = (float) cos(2*(k2+1)*M_PI/n);
2952 C[k2+1] = (float) -sin(2*(k2+1)*M_PI/n);
2953 }
2954
2955 // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
2956 // Note there are bugs in that pseudocode, presumably due to them attempting
2957 // to rename the arrays nicely rather than representing the way their actual
2958 // implementation bounces buffers back and forth. As a result, even in the
2959 // "some formulars corrected" version, a direct implementation fails. These
2960 // are noted below as "paper bug".
2961
2962 // copy and reflect spectral data
2963 for (k=0; k < n2; ++k) u[k] = buffer[k];
2964 for ( ; k < n ; ++k) u[k] = -buffer[n - k - 1];
2965 // kernel from paper
2966 // step 1
2967 for (k=k2=k4=0; k < n4; k+=1, k2+=2, k4+=4) {
2968 v[n-k4-1] = (u[k4] - u[n-k4-1]) * A[k2] - (u[k4+2] - u[n-k4-3])*A[k2+1];
2969 v[n-k4-3] = (u[k4] - u[n-k4-1]) * A[k2+1] + (u[k4+2] - u[n-k4-3])*A[k2];
2970 }
2971 // step 2
2972 for (k=k4=0; k < n8; k+=1, k4+=4) {
2973 w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
2974 w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
2975 w[k4+3] = (v[n2+3+k4] - v[k4+3])*A[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*A[n2-3-k4];
2976 w[k4+1] = (v[n2+1+k4] - v[k4+1])*A[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*A[n2-3-k4];
2977 }
2978 // step 3
2979 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
2980 for (l=0; l < ld-3; ++l) {
2981 int k0 = n >> (l+2), k1 = 1 << (l+3);
2982 int rlim = n >> (l+4), r4, r;
2983 int s2lim = 1 << (l+2), s2;
2984 for (r=r4=0; r < rlim; r4+=4,++r) {
2985 for (s2=0; s2 < s2lim; s2+=2) {
2986 u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
2987 u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
2988 u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1]
2989 - (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1+1];
2990 u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * A[r*k1]
2991 + (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * A[r*k1+1];
2992 }
2993 }
2994 if (l+1 < ld-3) {
2995 // paper bug: ping-ponging of u&w here is omitted
2996 memcpy(w, u, sizeof(u));
2997 }
2998 }
2999
3000 // step 4
3001 for (i=0; i < n8; ++i) {
3002 int j = bit_reverse(i) >> (32-ld+3);
3003 assert(j < n8);
3004 if (i == j) {
3005 // paper bug: original code probably swapped in place; if copying,
3006 // need to directly copy in this case
3007 int i8 = i << 3;
3008 v[i8+1] = u[i8+1];
3009 v[i8+3] = u[i8+3];
3010 v[i8+5] = u[i8+5];
3011 v[i8+7] = u[i8+7];
3012 } else if (i < j) {
3013 int i8 = i << 3, j8 = j << 3;
3014 v[j8+1] = u[i8+1], v[i8+1] = u[j8 + 1];
3015 v[j8+3] = u[i8+3], v[i8+3] = u[j8 + 3];
3016 v[j8+5] = u[i8+5], v[i8+5] = u[j8 + 5];
3017 v[j8+7] = u[i8+7], v[i8+7] = u[j8 + 7];
3018 }
3019 }
3020 // step 5
3021 for (k=0; k < n2; ++k) {
3022 w[k] = v[k*2+1];
3023 }
3024 // step 6
3025 for (k=k2=k4=0; k < n8; ++k, k2 += 2, k4 += 4) {
3026 u[n-1-k2] = w[k4];
3027 u[n-2-k2] = w[k4+1];
3028 u[n3_4 - 1 - k2] = w[k4+2];
3029 u[n3_4 - 2 - k2] = w[k4+3];
3030 }
3031 // step 7
3032 for (k=k2=0; k < n8; ++k, k2 += 2) {
3033 v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + C[k2+1]*(u[n2+k2]-u[n-2-k2]) + C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
3034 v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - C[k2+1]*(u[n2+k2]-u[n-2-k2]) - C[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2;
3035 v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
3036 v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + C[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - C[k2]*(u[n2+k2]-u[n-2-k2]))/2;
3037 }
3038 // step 8
3039 for (k=k2=0; k < n4; ++k,k2 += 2) {
3040 X[k] = v[k2+n2]*B[k2 ] + v[k2+1+n2]*B[k2+1];
3041 X[n2-1-k] = v[k2+n2]*B[k2+1] - v[k2+1+n2]*B[k2 ];
3042 }
3043
3044 // decode kernel to output
3045 // determined the following value experimentally
3046 // (by first figuring out what made inverse_mdct_slow work); then matching that here
3047 // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?)
3048 s = 0.5; // theoretically would be n4
3049
3050 // [[[ note! the s value of 0.5 is compensated for by the B[] in the current code,
3051 // so it needs to use the "old" B values to behave correctly, or else
3052 // set s to 1.0 ]]]
3053 for (i=0; i < n4 ; ++i) buffer[i] = s * X[i+n4];
3054 for ( ; i < n3_4; ++i) buffer[i] = -s * X[n3_4 - i - 1];
3055 for ( ; i < n ; ++i) buffer[i] = -s * X[i - n3_4];
3056}
3057#endif
3058
3059static float *get_window(vorb *f, int len)
3060{
3061 len <<= 1;
3062 if (len == f->blocksize_0) return f->window[0];
3063 if (len == f->blocksize_1) return f->window[1];
3064 return NULL;
3065}
3066
3067#ifndef STB_VORBIS_NO_DEFER_FLOOR
3068typedef int16 YTYPE;
3069#else
3070typedef int YTYPE;
3071#endif
3072static int do_floor(vorb *f, Mapping *map, int i, int n, float *target, YTYPE *finalY, uint8 *step2_flag)
3073{
3074 int n2 = n >> 1;
3075 int s = map->chan[i].mux, floor;
3076 floor = map->submap_floor[s];
3077 if (f->floor_types[floor] == 0) {
3078 return error(f, VORBIS_invalid_stream);
3079 } else {
3080 Floor1 *g = &f->floor_config[floor].floor1;
3081 int j,q;
3082 int lx = 0, ly = finalY[0] * g->floor1_multiplier;
3083 for (q=1; q < g->values; ++q) {
3084 j = g->sorted_order[q];
3085 #ifndef STB_VORBIS_NO_DEFER_FLOOR
3086 STBV_NOTUSED(step2_flag);
3087 if (finalY[j] >= 0)
3088 #else
3089 if (step2_flag[j])
3090 #endif
3091 {
3092 int hy = finalY[j] * g->floor1_multiplier;
3093 int hx = g->Xlist[j];
3094 if (lx != hx)
3095 draw_line(target, lx,ly, hx,hy, n2);
3096 CHECK(f);
3097 lx = hx, ly = hy;
3098 }
3099 }
3100 if (lx < n2) {
3101 // optimization of: draw_line(target, lx,ly, n,ly, n2);
3102 for (j=lx; j < n2; ++j)
3103 LINE_OP(target[j], inverse_db_table[ly]);
3104 CHECK(f);
3105 }
3106 }
3107 return TRUE;
3108}
3109
3110// The meaning of "left" and "right"
3111//
3112// For a given frame:
3113// we compute samples from 0..n
3114// window_center is n/2
3115// we'll window and mix the samples from left_start to left_end with data from the previous frame
3116// all of the samples from left_end to right_start can be output without mixing; however,
3117// this interval is 0-length except when transitioning between short and long frames
3118// all of the samples from right_start to right_end need to be mixed with the next frame,
3119// which we don't have, so those get saved in a buffer
3120// frame N's right_end-right_start, the number of samples to mix with the next frame,
3121// has to be the same as frame N+1's left_end-left_start (which they are by
3122// construction)
3123
3124static int vorbis_decode_initial(vorb *f, int *p_left_start, int *p_left_end, int *p_right_start, int *p_right_end, int *mode)
3125{
3126 Mode *m;
3127 int i, n, prev, next, window_center;
3128 f->channel_buffer_start = f->channel_buffer_end = 0;
3129
3130 retry:
3131 if (f->eof) return FALSE;
3132 if (!maybe_start_packet(f))
3133 return FALSE;
3134 // check packet type
3135 if (get_bits(f,1) != 0) {
3136 if (IS_PUSH_MODE(f))
3137 return error(f,VORBIS_bad_packet_type);
3138 while (EOP != get8_packet(f));
3139 goto retry;
3140 }
3141
3142 if (f->alloc.alloc_buffer)
3143 assert(f->alloc.alloc_buffer_length_in_bytes == f->temp_offset);
3144
3145 i = get_bits(f, ilog(f->mode_count-1));
3146 if (i == EOP) return FALSE;
3147 if (i >= f->mode_count) return FALSE;
3148 *mode = i;
3149 m = f->mode_config + i;
3150 if (m->blockflag) {
3151 n = f->blocksize_1;
3152 prev = get_bits(f,1);
3153 next = get_bits(f,1);
3154 } else {
3155 prev = next = 0;
3156 n = f->blocksize_0;
3157 }
3158
3159// WINDOWING
3160
3161 window_center = n >> 1;
3162 if (m->blockflag && !prev) {
3163 *p_left_start = (n - f->blocksize_0) >> 2;
3164 *p_left_end = (n + f->blocksize_0) >> 2;
3165 } else {
3166 *p_left_start = 0;
3167 *p_left_end = window_center;
3168 }
3169 if (m->blockflag && !next) {
3170 *p_right_start = (n*3 - f->blocksize_0) >> 2;
3171 *p_right_end = (n*3 + f->blocksize_0) >> 2;
3172 } else {
3173 *p_right_start = window_center;
3174 *p_right_end = n;
3175 }
3176
3177 return TRUE;
3178}
3179
3180static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start, int left_end, int right_start, int right_end, int *p_left)
3181{
3182 Mapping *map;
3183 int i,j,k,n,n2;
3184 int zero_channel[256];
3185 int really_zero_channel[256];
3186
3187// WINDOWING
3188
3189 STBV_NOTUSED(left_end);
3190 n = f->blocksize[m->blockflag];
3191 map = &f->mapping[m->mapping];
3192
3193// FLOORS
3194 n2 = n >> 1;
3195
3196 CHECK(f);
3197
3198 for (i=0; i < f->channels; ++i) {
3199 int s = map->chan[i].mux, floor;
3200 zero_channel[i] = FALSE;
3201 floor = map->submap_floor[s];
3202 if (f->floor_types[floor] == 0) {
3203 return error(f, VORBIS_invalid_stream);
3204 } else {
3205 Floor1 *g = &f->floor_config[floor].floor1;
3206 if (get_bits(f, 1)) {
3207 short *finalY;
3208 uint8 step2_flag[256];
3209 static int range_list[4] = { 256, 128, 86, 64 };
3210 int range = range_list[g->floor1_multiplier-1];
3211 int offset = 2;
3212 finalY = f->finalY[i];
3213 finalY[0] = get_bits(f, ilog(range)-1);
3214 finalY[1] = get_bits(f, ilog(range)-1);
3215 for (j=0; j < g->partitions; ++j) {
3216 int pclass = g->partition_class_list[j];
3217 int cdim = g->class_dimensions[pclass];
3218 int cbits = g->class_subclasses[pclass];
3219 int csub = (1 << cbits)-1;
3220 int cval = 0;
3221 if (cbits) {
3222 Codebook *c = f->codebooks + g->class_masterbooks[pclass];
3223 DECODE(cval,f,c);
3224 }
3225 for (k=0; k < cdim; ++k) {
3226 int book = g->subclass_books[pclass][cval & csub];
3227 cval = cval >> cbits;
3228 if (book >= 0) {
3229 int temp;
3230 Codebook *c = f->codebooks + book;
3231 DECODE(temp,f,c);
3232 finalY[offset++] = temp;
3233 } else
3234 finalY[offset++] = 0;
3235 }
3236 }
3237 if (f->valid_bits == INVALID_BITS) goto error; // behavior according to spec
3238 step2_flag[0] = step2_flag[1] = 1;
3239 for (j=2; j < g->values; ++j) {
3240 int low, high, pred, highroom, lowroom, room, val;
3241 low = g->neighbors[j][0];
3242 high = g->neighbors[j][1];
3243 //neighbors(g->Xlist, j, &low, &high);
3244 pred = predict_point(g->Xlist[j], g->Xlist[low], g->Xlist[high], finalY[low], finalY[high]);
3245 val = finalY[j];
3246 highroom = range - pred;
3247 lowroom = pred;
3248 if (highroom < lowroom)
3249 room = highroom * 2;
3250 else
3251 room = lowroom * 2;
3252 if (val) {
3253 step2_flag[low] = step2_flag[high] = 1;
3254 step2_flag[j] = 1;
3255 if (val >= room)
3256 if (highroom > lowroom)
3257 finalY[j] = val - lowroom + pred;
3258 else
3259 finalY[j] = pred - val + highroom - 1;
3260 else
3261 if (val & 1)
3262 finalY[j] = pred - ((val+1)>>1);
3263 else
3264 finalY[j] = pred + (val>>1);
3265 } else {
3266 step2_flag[j] = 0;
3267 finalY[j] = pred;
3268 }
3269 }
3270
3271#ifdef STB_VORBIS_NO_DEFER_FLOOR
3272 do_floor(f, map, i, n, f->floor_buffers[i], finalY, step2_flag);
3273#else
3274 // defer final floor computation until _after_ residue
3275 for (j=0; j < g->values; ++j) {
3276 if (!step2_flag[j])
3277 finalY[j] = -1;
3278 }
3279#endif
3280 } else {
3281 error:
3282 zero_channel[i] = TRUE;