# HG changeset patch # User Pascal Bellard # Date 1240926437 -7200 # Node ID a36fcdad71b15878debd934bd01a140b6b740381 # Parent 1ac0ba1a2795f739e55d8f7f9568f738fb756940 linux: speedup unlzma (18% faster) diff -r 1ac0ba1a2795 -r a36fcdad71b1 linux/stuff/linux-lzma-2.6.25.5.u --- a/linux/stuff/linux-lzma-2.6.25.5.u Tue Apr 28 15:38:14 2009 +0200 +++ b/linux/stuff/linux-lzma-2.6.25.5.u Tue Apr 28 15:47:17 2009 +0200 @@ -1444,7 +1444,7 @@ --- linux-2.6.25.5/lib/decompress_unlzma.c +++ linux-2.6.25.5/lib/decompress_unlzma.c -@@ -0,0 +1,616 @@ +@@ -0,0 +1,580 @@ +/* Lzma decompressor for Linux kernel. Shamelessly snarfed + * from busybox 1.1.1 + * @@ -1546,8 +1546,10 @@ + +#ifdef CONFIG_FEATURE_LZMA_FAST +# define speed_inline always_inline ++# define size_inline +#else +# define speed_inline ++# define size_inline always_inline +#endif + + @@ -1559,7 +1561,6 @@ + int buffer_size; + uint32_t code; + uint32_t range; -+ uint32_t bound; +} rc_t; + + @@ -1569,7 +1570,7 @@ + + +/* Called twice: once at startup and once in rc_normalize() */ -+static void rc_read(rc_t * rc) ++static size_inline void rc_read(rc_t * rc) +{ + if (!rc->buffer_size) return; + if (rc->fill) { @@ -1607,8 +1608,8 @@ + } +} + -+/* Called twice, but one callsite is in speed_inline'd rc_is_bit_0_helper() */ -+static void rc_do_normalize(rc_t * rc) ++/* Called twice, but one callsite is in speed_inline'd rc_is_bit_1() */ ++static speed_inline void rc_do_normalize(rc_t * rc) +{ + if (rc->ptr >= rc->buffer_end) + rc_read(rc); @@ -1623,46 +1624,30 @@ +} + +/* Called 9 times */ -+/* Why rc_is_bit_0_helper exists? -+ * Because we want to always expose (rc->code < rc->bound) to optimizer -+ */ -+static speed_inline uint32_t rc_is_bit_0_helper(rc_t * rc, uint16_t * p) ++static speed_inline int rc_is_bit_1(rc_t * rc, uint16_t * p) +{ ++ uint32_t bound; + rc_normalize(rc); -+ rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS); -+ return rc->bound; -+} -+static always_inline int rc_is_bit_0(rc_t * rc, uint16_t * p) -+{ -+ uint32_t t = rc_is_bit_0_helper(rc, p); -+ return rc->code < t; -+} -+ -+/* Called ~10 times, but very small, thus inlined */ -+static speed_inline void rc_update_bit_0(rc_t * rc, uint16_t * p) -+{ -+ rc->range = rc->bound; -+ *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; -+} -+static speed_inline void rc_update_bit_1(rc_t * rc, uint16_t * p) -+{ -+ rc->range -= rc->bound; -+ rc->code -= rc->bound; -+ *p -= *p >> RC_MOVE_BITS; ++ bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS); ++ if (rc->code < bound) { ++ rc->range = bound; ++ *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; ++ return 0; ++ } ++ else { ++ rc->code -= bound; ++ rc->range -= bound; ++ *p -= *p >> RC_MOVE_BITS; ++ return 1; ++ } +} + +/* Called 4 times in unlzma loop */ -+static int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol) ++static speed_inline int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol) +{ -+ if (rc_is_bit_0(rc, p)) { -+ rc_update_bit_0(rc, p); -+ *symbol *= 2; -+ return 0; -+ } else { -+ rc_update_bit_1(rc, p); -+ *symbol = *symbol * 2 + 1; -+ return 1; -+ } ++ int ret = rc_is_bit_1(rc, p); ++ *symbol = *symbol * 2 + ret; ++ return ret; +} + +/* Called once */ @@ -1838,9 +1823,8 @@ + + prob = + p + LZMA_IS_MATCH + (state << LZMA_NUM_POS_BITS_MAX) + pos_state; -+ if (rc_is_bit_0(&rc, prob)) { ++ if (!rc_is_bit_1(&rc, prob)) { + mi = 1; -+ rc_update_bit_0(&rc, prob); + prob = (p + LZMA_LITERAL + (LZMA_LIT_SIZE + * ((((buffer_pos + global_pos) & literal_pos_mask) << lc) + + (previous_byte >> (8 - lc))))); @@ -1863,49 +1847,43 @@ + match_byte <<= 1; + bit = match_byte & 0x100; + prob_lit = prob + 0x100 + bit + mi; -+ if (rc_get_bit(&rc, prob_lit, &mi)) { -+ if (!bit) -+ break; -+ } else { -+ if (bit) -+ break; -+ } ++ bit ^= (rc_get_bit(&rc, prob_lit, &mi) << 8); ++ if (bit) ++ break; + } while (mi < 0x100); + } + while (mi < 0x100) { + prob_lit = prob + mi; + rc_get_bit(&rc, prob_lit, &mi); + } ++ state -= 3; ++ if (state < 4 - 3) ++ state = 0; ++ if (state >= 10-3) ++ state -= 6-3; + previous_byte = (uint8_t) mi; -+ if (state < 4) -+ state = 0; -+ else if (state < 10) -+ state -= 3; -+ else -+ state -= 6; ++#ifdef xxCONFIG_FEATURE_LZMA_FAST ++#else + goto store_previous_byte; ++#endif + } else { + int offset; + uint16_t *prob_len; + -+ rc_update_bit_1(&rc, prob); + prob = p + LZMA_IS_REP + state; -+ if (rc_is_bit_0(&rc, prob)) { -+ rc_update_bit_0(&rc, prob); ++ if (!rc_is_bit_1(&rc, prob)) { + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + state = state < LZMA_NUM_LIT_STATES ? 0 : 3; + prob = p + LZMA_LEN_CODER; + } else { -+ rc_update_bit_1(&rc, prob); -+ prob = p + LZMA_IS_REP_G0 + state; -+ if (rc_is_bit_0(&rc, prob)) { -+ rc_update_bit_0(&rc, prob); ++ prob += LZMA_IS_REP_G0 - LZMA_IS_REP; ++ if (!rc_is_bit_1(&rc, prob)) { + prob = (p + LZMA_IS_REP_0_LONG -+ + (state << LZMA_NUM_POS_BITS_MAX) + pos_state); -+ if (rc_is_bit_0(&rc, prob)) { -+ rc_update_bit_0(&rc, prob); ++ + (state << LZMA_NUM_POS_BITS_MAX) ++ + pos_state); ++ if (!rc_is_bit_1(&rc, prob)) { + + state = state < LZMA_NUM_LIT_STATES ? 9 : 11; + pos = buffer_pos - rep0; @@ -1926,25 +1904,16 @@ + writebb((char*)buffer, header.dict_size); + } + continue; -+ } else { -+ rc_update_bit_1(&rc, prob); + } + } else { + uint32_t distance; + -+ rc_update_bit_1(&rc, prob); -+ prob = p + LZMA_IS_REP_G1 + state; -+ if (rc_is_bit_0(&rc, prob)) { -+ rc_update_bit_0(&rc, prob); -+ distance = rep1; -+ } else { -+ rc_update_bit_1(&rc, prob); -+ prob = p + LZMA_IS_REP_G2 + state; -+ if (rc_is_bit_0(&rc, prob)) { -+ rc_update_bit_0(&rc, prob); -+ distance = rep2; -+ } else { -+ rc_update_bit_1(&rc, prob); ++ prob += LZMA_IS_REP_G1 - LZMA_IS_REP_G0; ++ distance = rep1; ++ if (rc_is_bit_1(&rc, prob)) { ++ prob += LZMA_IS_REP_G2 - LZMA_IS_REP_G1; ++ distance = rep2; ++ if (rc_is_bit_1(&rc, prob)) { + distance = rep3; + rep3 = rep2; + } @@ -1958,26 +1927,22 @@ + } + + prob_len = prob + LZMA_LEN_CHOICE; -+ if (rc_is_bit_0(&rc, prob_len)) { -+ rc_update_bit_0(&rc, prob_len); -+ prob_len = (prob + LZMA_LEN_LOW -+ + (pos_state << LZMA_LEN_NUM_LOW_BITS)); ++ if (!rc_is_bit_1(&rc, prob_len)) { ++ prob_len += LZMA_LEN_LOW - LZMA_LEN_CHOICE ++ + (pos_state << LZMA_LEN_NUM_LOW_BITS); + offset = 0; + num_bits = LZMA_LEN_NUM_LOW_BITS; + } else { -+ rc_update_bit_1(&rc, prob_len); -+ prob_len = prob + LZMA_LEN_CHOICE_2; -+ if (rc_is_bit_0(&rc, prob_len)) { -+ rc_update_bit_0(&rc, prob_len); -+ prob_len = (prob + LZMA_LEN_MID -+ + (pos_state << LZMA_LEN_NUM_MID_BITS)); ++ prob_len += LZMA_LEN_CHOICE_2 - LZMA_LEN_CHOICE; ++ if (!rc_is_bit_1(&rc, prob_len)) { ++ prob_len += LZMA_LEN_MID - LZMA_LEN_CHOICE_2 ++ + (pos_state << LZMA_LEN_NUM_MID_BITS); + offset = 1 << LZMA_LEN_NUM_LOW_BITS; + num_bits = LZMA_LEN_NUM_MID_BITS; + } else { -+ rc_update_bit_1(&rc, prob_len); -+ prob_len = prob + LZMA_LEN_HIGH; ++ prob_len += LZMA_LEN_HIGH - LZMA_LEN_CHOICE_2; + offset = ((1 << LZMA_LEN_NUM_LOW_BITS) -+ + (1 << LZMA_LEN_NUM_MID_BITS)); ++ + (1 << LZMA_LEN_NUM_MID_BITS)); + num_bits = LZMA_LEN_NUM_HIGH_BITS; + } + } @@ -1988,25 +1953,25 @@ + int pos_slot; + + state += LZMA_NUM_LIT_STATES; -+ prob = -+ p + LZMA_POS_SLOT + ++ prob = p + LZMA_POS_SLOT + + ((len < + LZMA_NUM_LEN_TO_POS_STATES ? len : + LZMA_NUM_LEN_TO_POS_STATES - 1) + << LZMA_NUM_POS_SLOT_BITS); + rc_bit_tree_decode(&rc, prob, LZMA_NUM_POS_SLOT_BITS, + &pos_slot); ++ rep0 = pos_slot; + if (pos_slot >= LZMA_START_POS_MODEL_INDEX) { + num_bits = (pos_slot >> 1) - 1; + rep0 = 2 | (pos_slot & 1); ++ prob = p + LZMA_ALIGN; + if (pos_slot < LZMA_END_POS_MODEL_INDEX) { + rep0 <<= num_bits; -+ prob = p + LZMA_SPEC_POS + rep0 - pos_slot - 1; ++ prob += LZMA_SPEC_POS - LZMA_ALIGN - 1 + rep0 - pos_slot; + } else { + num_bits -= LZMA_NUM_ALIGN_BITS; + while (num_bits--) + rep0 = (rep0 << 1) | rc_direct_bit(&rc); -+ prob = p + LZMA_ALIGN; + rep0 <<= LZMA_NUM_ALIGN_BITS; + num_bits = LZMA_NUM_ALIGN_BITS; + } @@ -2017,8 +1982,7 @@ + rep0 |= i; + i <<= 1; + } -+ } else -+ rep0 = pos_slot; ++ } + if (++rep0 == 0) + break; + }