--- /dev/null
+Index: src/libFLAC/bitreader.c
+===================================================================
+RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
+retrieving revision 1.15
+diff -u -r1.15 bitreader.c
+--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
++++ src/libFLAC/bitreader.c 14 Mar 2008 11:07:07 -0000
+@@ -69,13 +69,12 @@
+ #endif
+ /* counts the # of zero MSBs in a word */
+ #define COUNT_ZERO_MSBS(word) ( \
+- (word) <= 0xffff ? \
+- ( (word) <= 0xff? byte_to_unary_table[word] + 24 : byte_to_unary_table[(word) >> 8] + 16 ) : \
+- ( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 : byte_to_unary_table[(word) >> 24] ) \
++ word > 0xffffff ? byte_to_unary_table[(word) >> 24] : \
++ !word ? 32 : \
++ word > 0xffff ? byte_to_unary_table[word >> 16] + 8 : \
++ word > 0xff ? byte_to_unary_table[(word) >> 8] + 16 : \
++ byte_to_unary_table[word] + 24 \
+ )
+-/* this alternate might be slightly faster on some systems/compilers: */
+-#define COUNT_ZERO_MSBS2(word) ( (word) <= 0xff ? byte_to_unary_table[word] + 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 : ((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 : byte_to_unary_table[(word) >> 24])) )
+-
+
+ /*
+ * This should be at least twice as large as the largest number of words
+Index: src/libFLAC/bitreader.c
+===================================================================
+RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
+retrieving revision 1.15
+diff -u -r1.15 bitreader.c
+--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
++++ src/libFLAC/bitreader.c 14 Mar 2008 13:19:46 -0000
+@@ -149,6 +148,7 @@
+ FLAC__CPUInfo cpu_info;
+ };
+
++#if FLAC__BYTES_PER_WORD == 4 && FLAC__CPU_IA32
+ #ifdef _MSC_VER
+ /* OPT: an MSVC built-in would be better */
+ static _inline FLAC__uint32 local_swap32_(FLAC__uint32 x)
+@@ -173,6 +173,15 @@
+ done1:
+ }
+ }
++#elif __GNUC__
++static void local_swap32_block_(FLAC__uint32 *start, FLAC__uint32 len)
++{
++ FLAC__uint32 *end;
++
++ for(end = start + len; start < end; start++)
++ asm ("bswap %0" : "=r"(*start) : "0"(*start));
++}
++#endif
+ #endif
+
+ static FLaC__INLINE void crc16_update_word_(FLAC__BitReader *br, brword word)
+@@ -263,7 +272,7 @@
+ #if WORDS_BIGENDIAN
+ #else
+ end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
+-# if defined(_MSC_VER) && (FLAC__BYTES_PER_WORD == 4)
++# if FLAC__CPU_IA32 && (__GNUC__ || defined(_MSC_VER)) && FLAC__BYTES_PER_WORD == 4
+ if(br->cpu_info.type == FLAC__CPUINFO_TYPE_IA32 && br->cpu_info.data.ia32.bswap) {
+ start = br->words;
+ local_swap32_block_(br->buffer + start, end - start);
+Index: src/libFLAC/bitreader.c
+===================================================================
+RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
+retrieving revision 1.15
+diff -u -r1.15 bitreader.c
+--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
++++ src/libFLAC/bitreader.c 17 Mar 2008 15:42:57 -0000
+@@ -803,379 +812,144 @@
+ }
+
+ /* this is by far the most heavily used reader call. it ain't pretty but it's fast */
+-/* a lot of the logic is copied, then adapted, from FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
+ FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
+-/* OPT: possibly faster version for use with MSVC */
+-#ifdef _MSC_VER
+ {
+- unsigned i;
+- unsigned uval = 0;
+- unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword */
+-
+ /* try and get br->consumed_words and br->consumed_bits into register;
+ * must remember to flush them back to *br before calling other
+- * bitwriter functions that use them, and before returning */
+- register unsigned cwords;
+- register unsigned cbits;
++ * bitreader functions that use them, and before returning */
++ unsigned cwords, words, lsbs, msbs, x, y;
++ unsigned ucbits; /* keep track of the number of unconsumed bits in word */
++ brword b;
++ int *val, *end;
+
+ FLAC__ASSERT(0 != br);
+ FLAC__ASSERT(0 != br->buffer);
+ /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
+ FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
+ FLAC__ASSERT(parameter < 32);
+- /* the above two asserts also guarantee that the binary part never straddles more that 2 words, so we don't have to loop to read it */
+-
+- if(nvals == 0)
+- return true;
+-
+- cbits = br->consumed_bits;
+- cwords = br->consumed_words;
++ /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
+
+- while(1) {
++ val = vals;
++ end = vals + nvals;
+
+- /* read unary part */
+- while(1) {
+- while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+- brword b = br->buffer[cwords] << cbits;
+- if(b) {
+-#if 0 /* slower, probably due to bad register allocation... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32
+- __asm {
+- bsr eax, b
+- not eax
+- and eax, 31
+- mov i, eax
+- }
+-#else
+- i = COUNT_ZERO_MSBS(b);
+-#endif
+- uval += i;
+- bits = parameter;
+- i++;
+- cbits += i;
+- if(cbits == FLAC__BITS_PER_WORD) {
+- crc16_update_word_(br, br->buffer[cwords]);
+- cwords++;
+- cbits = 0;
+- }
+- goto break1;
+- }
+- else {
+- uval += FLAC__BITS_PER_WORD - cbits;
+- crc16_update_word_(br, br->buffer[cwords]);
+- cwords++;
+- cbits = 0;
+- /* didn't find stop bit yet, have to keep going... */
+- }
+- }
+- /* at this point we've eaten up all the whole words; have to try
+- * reading through any tail bytes before calling the read callback.
+- * this is a repeat of the above logic adjusted for the fact we
+- * don't have a whole word. note though if the client is feeding
+- * us data a byte at a time (unlikely), br->consumed_bits may not
+- * be zero.
+- */
+- if(br->bytes) {
+- const unsigned end = br->bytes * 8;
+- brword b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits;
+- if(b) {
+- i = COUNT_ZERO_MSBS(b);
+- uval += i;
+- bits = parameter;
+- i++;
+- cbits += i;
+- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+- goto break1;
+- }
+- else {
+- uval += end - cbits;
+- cbits += end;
+- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+- /* didn't find stop bit yet, have to keep going... */
+- }
+- }
+- /* flush registers and read; bitreader_read_from_client_() does
+- * not touch br->consumed_bits at all but we still need to set
+- * it in case it fails and we have to return false.
+- */
+- br->consumed_bits = cbits;
+- br->consumed_words = cwords;
+- if(!bitreader_read_from_client_(br))
++ if(parameter == 0) {
++ while(val < end) {
++ /* read the unary MSBs and end bit */
++ if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
+ return false;
+- cwords = br->consumed_words;
+- }
+-break1:
+- /* read binary part */
+- FLAC__ASSERT(cwords <= br->words);
+-
+- if(bits) {
+- while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits < bits) {
+- /* flush registers and read; bitreader_read_from_client_() does
+- * not touch br->consumed_bits at all but we still need to set
+- * it in case it fails and we have to return false.
+- */
+- br->consumed_bits = cbits;
+- br->consumed_words = cwords;
+- if(!bitreader_read_from_client_(br))
+- return false;
+- cwords = br->consumed_words;
+- }
+- if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+- if(cbits) {
+- /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+- const unsigned n = FLAC__BITS_PER_WORD - cbits;
+- const brword word = br->buffer[cwords];
+- if(bits < n) {
+- uval <<= bits;
+- uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-bits);
+- cbits += bits;
+- goto break2;
+- }
+- uval <<= n;
+- uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
+- bits -= n;
+- crc16_update_word_(br, word);
+- cwords++;
+- cbits = 0;
+- if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
+- uval <<= bits;
+- uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
+- cbits = bits;
+- }
+- goto break2;
+- }
+- else {
+- FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
+- uval <<= bits;
+- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
+- cbits = bits;
+- goto break2;
+- }
+- }
+- else {
+- /* in this case we're starting our read at a partial tail word;
+- * the reader has guaranteed that we have at least 'bits' bits
+- * available to read, which makes this case simpler.
+- */
+- uval <<= bits;
+- if(cbits) {
+- /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+- FLAC__ASSERT(cbits + bits <= br->bytes*8);
+- uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-bits);
+- cbits += bits;
+- goto break2;
+- }
+- else {
+- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
+- cbits += bits;
+- goto break2;
+- }
+- }
+- }
+-break2:
+- /* compose the value */
+- *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+
+- /* are we done? */
+- --nvals;
+- if(nvals == 0) {
+- br->consumed_bits = cbits;
+- br->consumed_words = cwords;
+- return true;
++ *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
+ }
+
+- uval = 0;
+- ++vals;
+-
++ return true;
+ }
+-}
+-#else
+-{
+- unsigned i;
+- unsigned uval = 0;
+
+- /* try and get br->consumed_words and br->consumed_bits into register;
+- * must remember to flush them back to *br before calling other
+- * bitwriter functions that use them, and before returning */
+- register unsigned cwords;
+- register unsigned cbits;
+- unsigned ucbits; /* keep track of the number of unconsumed bits in the buffer */
++ FLAC__ASSERT(parameter > 0);
+
+- FLAC__ASSERT(0 != br);
+- FLAC__ASSERT(0 != br->buffer);
+- /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
+- FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
+- FLAC__ASSERT(parameter < 32);
+- /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
++ cwords = br->consumed_words;
++ words = br->words;
+
+- if(nvals == 0)
+- return true;
++ /* if we've not consumed up to a partial tail word... */
++ if(cwords >= words) {
++ x = 0;
++ goto process_tail;
++ }
++
++ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
++ b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */
++
++ while(val < end) {
++ /* read the unary MSBs and end bit */
++ x = y = COUNT_ZERO_MSBS(b);
++ if(x == FLAC__BITS_PER_WORD) {
++ x = ucbits;
++ do {
++ /* didn't find stop bit yet, have to keep going... */
++ crc16_update_word_(br, br->buffer[cwords++]);
++ if (cwords >= words)
++ goto incomplete_msbs;
++ b = br->buffer[cwords];
++ y = COUNT_ZERO_MSBS(b);
++ x += y;
++ } while(y == FLAC__BITS_PER_WORD);
++ }
++ b <<= y;
++ b <<= 1; /* account for stop bit */
++ ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
++ msbs = x;
++
++ /* read the binary LSBs */
++ x = b >> (FLAC__BITS_PER_WORD - parameter);
++ if(parameter <= ucbits) {
++ ucbits -= parameter;
++ b <<= parameter;
++ } else {
++ /* there are still bits left to read, they will all be in the next word */
++ crc16_update_word_(br, br->buffer[cwords++]);
++ if (cwords >= words)
++ goto incomplete_lsbs;
++ b = br->buffer[cwords];
++ ucbits += FLAC__BITS_PER_WORD - parameter;
++ x |= b >> ucbits;
++ b <<= FLAC__BITS_PER_WORD - ucbits;
++ }
++ lsbs = x;
+
+- cbits = br->consumed_bits;
+- cwords = br->consumed_words;
+- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
++ /* compose the value */
++ x = (msbs << parameter) | lsbs;
++ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
+
+- while(1) {
++ continue;
+
+- /* read unary part */
+- while(1) {
+- while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+- brword b = br->buffer[cwords] << cbits;
+- if(b) {
+-#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 && defined __GNUC__
+- asm volatile (
+- "bsrl %1, %0;"
+- "notl %0;"
+- "andl $31, %0;"
+- : "=r"(i)
+- : "r"(b)
+- );
+-#else
+- i = COUNT_ZERO_MSBS(b);
+-#endif
+- uval += i;
+- cbits += i;
+- cbits++; /* skip over stop bit */
+- if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
+- crc16_update_word_(br, br->buffer[cwords]);
+- cwords++;
+- cbits = 0;
+- }
+- goto break1;
+- }
+- else {
+- uval += FLAC__BITS_PER_WORD - cbits;
+- crc16_update_word_(br, br->buffer[cwords]);
+- cwords++;
+- cbits = 0;
+- /* didn't find stop bit yet, have to keep going... */
+- }
+- }
+- /* at this point we've eaten up all the whole words; have to try
+- * reading through any tail bytes before calling the read callback.
+- * this is a repeat of the above logic adjusted for the fact we
+- * don't have a whole word. note though if the client is feeding
+- * us data a byte at a time (unlikely), br->consumed_bits may not
+- * be zero.
+- */
+- if(br->bytes) {
+- const unsigned end = br->bytes * 8;
+- brword b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
+- if(b) {
+- i = COUNT_ZERO_MSBS(b);
+- uval += i;
+- cbits += i;
+- cbits++; /* skip over stop bit */
+- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+- goto break1;
+- }
+- else {
+- uval += end - cbits;
+- cbits += end;
+- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+- /* didn't find stop bit yet, have to keep going... */
+- }
++ /* at this point we've eaten up all the whole words */
++process_tail:
++ do {
++ if(0) {
++incomplete_msbs:
++ br->consumed_bits = 0;
++ br->consumed_words = cwords;
+ }
+- /* flush registers and read; bitreader_read_from_client_() does
+- * not touch br->consumed_bits at all but we still need to set
+- * it in case it fails and we have to return false.
+- */
+- br->consumed_bits = cbits;
+- br->consumed_words = cwords;
+- if(!bitreader_read_from_client_(br))
++
++ /* read the unary MSBs and end bit */
++ if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
+ return false;
+- cwords = br->consumed_words;
+- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval;
+- /* + uval to offset our count by the # of unary bits already
+- * consumed before the read, because we will add these back
+- * in all at once at break1
+- */
+- }
+-break1:
+- ucbits -= uval;
+- ucbits--; /* account for stop bit */
+-
+- /* read binary part */
+- FLAC__ASSERT(cwords <= br->words);
+-
+- if(parameter) {
+- while(ucbits < parameter) {
+- /* flush registers and read; bitreader_read_from_client_() does
+- * not touch br->consumed_bits at all but we still need to set
+- * it in case it fails and we have to return false.
+- */
+- br->consumed_bits = cbits;
++ msbs += x;
++ x = ucbits = 0;
++
++ if(0) {
++incomplete_lsbs:
++ br->consumed_bits = 0;
+ br->consumed_words = cwords;
+- if(!bitreader_read_from_client_(br))
+- return false;
+- cwords = br->consumed_words;
+- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+- }
+- if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+- if(cbits) {
+- /* this also works when consumed_bits==0, it's just slower than necessary for that case */
+- const unsigned n = FLAC__BITS_PER_WORD - cbits;
+- const brword word = br->buffer[cwords];
+- if(parameter < n) {
+- uval <<= parameter;
+- uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
+- cbits += parameter;
+- }
+- else {
+- uval <<= n;
+- uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
+- crc16_update_word_(br, word);
+- cwords++;
+- cbits = parameter - n;
+- if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
+- uval <<= cbits;
+- uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
+- }
+- }
+- }
+- else {
+- cbits = parameter;
+- uval <<= parameter;
+- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
+- }
+ }
+- else {
+- /* in this case we're starting our read at a partial tail word;
+- * the reader has guaranteed that we have at least 'parameter'
+- * bits available to read, which makes this case simpler.
+- */
+- uval <<= parameter;
+- if(cbits) {
+- /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+- FLAC__ASSERT(cbits + parameter <= br->bytes*8);
+- uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
+- cbits += parameter;
+- }
+- else {
+- cbits = parameter;
+- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
+- }
+- }
+- }
+
+- ucbits -= parameter;
+-
+- /* compose the value */
+- *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
++ /* read the binary LSBs */
++ if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
++ return false;
++ lsbs = x | lsbs;
+
+- /* are we done? */
+- --nvals;
+- if(nvals == 0) {
+- br->consumed_bits = cbits;
+- br->consumed_words = cwords;
+- return true;
+- }
++ /* compose the value */
++ x = (msbs << parameter) | lsbs;
++ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
++ x = 0;
+
+- uval = 0;
+- ++vals;
++ cwords = br->consumed_words;
++ words = br->words;
++ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
++ b = br->buffer[cwords] << br->consumed_bits;
++ } while(cwords >= words && val < end);
++ }
+
++ if(ucbits == 0 && cwords < words) {
++ /* don't leave the head word with no unconsumed bits */
++ crc16_update_word_(br, br->buffer[cwords++]);
++ ucbits = FLAC__BITS_PER_WORD;
+ }
++
++ br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
++ br->consumed_words = cwords;
++
++ return true;
+ }
+-#endif
+
+ #if 0 /* UNUSED */
+ FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter)