wok rev 2796
linux: speedup unlzma (18% faster)
author | Pascal Bellard <pascal.bellard@slitaz.org>
---|---
date | Tue Apr 28 15:47:17 2009 +0200 (2009-04-28)
parents | 1ac0ba1a2795
children | 08e60b697edb
files | linux/stuff/linux-lzma-2.6.25.5.u
line diff
--- a/linux/stuff/linux-lzma-2.6.25.5.u	Tue Apr 28 15:38:14 2009 +0200
+++ b/linux/stuff/linux-lzma-2.6.25.5.u	Tue Apr 28 15:47:17 2009 +0200
@@ -1444,7 +1444,7 @@
 
 --- linux-2.6.25.5/lib/decompress_unlzma.c
 +++ linux-2.6.25.5/lib/decompress_unlzma.c
-@@ -0,0 +1,616 @@
+@@ -0,0 +1,580 @@
 +/* Lzma decompressor for Linux kernel. Shamelessly snarfed
 + * from busybox 1.1.1
 + *
@@ -1546,8 +1546,10 @@
 +
 +#ifdef CONFIG_FEATURE_LZMA_FAST
 +# define speed_inline always_inline
++# define size_inline
 +#else
 +# define speed_inline
++# define size_inline always_inline
 +#endif
 +
 +
@@ -1559,7 +1561,6 @@
 +	int buffer_size;
 +	uint32_t code;
 +	uint32_t range;
-+	uint32_t bound;
 +} rc_t;
 +
 +
@@ -1569,7 +1570,7 @@
 +
 +
 +/* Called twice: once at startup and once in rc_normalize() */
-+static void rc_read(rc_t * rc)
++static size_inline void rc_read(rc_t * rc)
 +{
 +	if (!rc->buffer_size) return;
 +	if (rc->fill) {
@@ -1607,8 +1608,8 @@
 +	}
 +}
 +
-+/* Called twice, but one callsite is in speed_inline'd rc_is_bit_0_helper() */
-+static void rc_do_normalize(rc_t * rc)
++/* Called twice, but one callsite is in speed_inline'd rc_is_bit_1() */
++static speed_inline void rc_do_normalize(rc_t * rc)
 +{
 +	if (rc->ptr >= rc->buffer_end)
 +		rc_read(rc);
@@ -1623,46 +1624,30 @@
 +}
 +
 +/* Called 9 times */
-+/* Why rc_is_bit_0_helper exists?
-+ * Because we want to always expose (rc->code < rc->bound) to optimizer
-+ */
-+static speed_inline uint32_t rc_is_bit_0_helper(rc_t * rc, uint16_t * p)
++static speed_inline int rc_is_bit_1(rc_t * rc, uint16_t * p)
 +{
++	uint32_t bound;
 +	rc_normalize(rc);
-+	rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
-+	return rc->bound;
-+}
-+static always_inline int rc_is_bit_0(rc_t * rc, uint16_t * p)
-+{
-+	uint32_t t = rc_is_bit_0_helper(rc, p);
-+	return rc->code < t;
-+}
-+
-+/* Called ~10 times, but very small, thus inlined */
-+static speed_inline void rc_update_bit_0(rc_t * rc, uint16_t * p)
-+{
-+	rc->range = rc->bound;
-+	*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
-+}
-+static speed_inline void rc_update_bit_1(rc_t * rc, uint16_t * p)
-+{
-+	rc->range -= rc->bound;
-+	rc->code -= rc->bound;
-+	*p -= *p >> RC_MOVE_BITS;
++	bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
++	if (rc->code < bound) {
++		rc->range = bound;
++		*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
++		return 0;
++	}
++	else {
++		rc->code -= bound;
++		rc->range -= bound;
++		*p -= *p >> RC_MOVE_BITS;
++		return 1;
++	}
 +}
 +
 +/* Called 4 times in unlzma loop */
-+static int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol)
++static speed_inline int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol)
 +{
-+	if (rc_is_bit_0(rc, p)) {
-+		rc_update_bit_0(rc, p);
-+		*symbol *= 2;
-+		return 0;
-+	} else {
-+		rc_update_bit_1(rc, p);
-+		*symbol = *symbol * 2 + 1;
-+		return 1;
-+	}
++	int ret = rc_is_bit_1(rc, p);
++	*symbol = *symbol * 2 + ret;
++	return ret;
 +}
 +
 +/* Called once */
@@ -1838,9 +1823,8 @@
 +
 +			prob =
 +			    p + LZMA_IS_MATCH + (state << LZMA_NUM_POS_BITS_MAX) + pos_state;
-+			if (rc_is_bit_0(&rc, prob)) {
++			if (!rc_is_bit_1(&rc, prob)) {
 +				mi = 1;
-+				rc_update_bit_0(&rc, prob);
 +				prob = (p + LZMA_LITERAL + (LZMA_LIT_SIZE
 +					* ((((buffer_pos + global_pos) & literal_pos_mask) << lc)
 +					+ (previous_byte >> (8 - lc)))));
@@ -1863,49 +1847,43 @@
 +						match_byte <<= 1;
 +						bit = match_byte & 0x100;
 +						prob_lit = prob + 0x100 + bit + mi;
-+						if (rc_get_bit(&rc, prob_lit, &mi)) {
-+							if (!bit)
-+								break;
-+						} else {
-+							if (bit)
-+								break;
-+						}
++						bit ^= (rc_get_bit(&rc, prob_lit, &mi) << 8);
++						if (bit)
++							break;
 +					} while (mi < 0x100);
 +				}
 +				while (mi < 0x100) {
 +					prob_lit = prob + mi;
 +					rc_get_bit(&rc, prob_lit, &mi);
 +				}
++				state -= 3;
++				if (state < 4 - 3)
++					state = 0;
++				if (state >= 10-3)
++					state -= 6-3;
 +				previous_byte = (uint8_t) mi;
-+				if (state < 4)
-+					state = 0;
-+				else if (state < 10)
-+					state -= 3;
-+				else
-+					state -= 6;
++#ifdef xxCONFIG_FEATURE_LZMA_FAST
++#else
 +				goto store_previous_byte;
++#endif
 +			} else {
 +				int offset;
 +				uint16_t *prob_len;
 +
-+				rc_update_bit_1(&rc, prob);
 +				prob = p + LZMA_IS_REP + state;
-+				if (rc_is_bit_0(&rc, prob)) {
-+					rc_update_bit_0(&rc, prob);
++				if (!rc_is_bit_1(&rc, prob)) {
 +					rep3 = rep2;
 +					rep2 = rep1;
 +					rep1 = rep0;
 +					state = state < LZMA_NUM_LIT_STATES ? 0 : 3;
 +					prob = p + LZMA_LEN_CODER;
 +				} else {
-+					rc_update_bit_1(&rc, prob);
-+					prob = p + LZMA_IS_REP_G0 + state;
-+					if (rc_is_bit_0(&rc, prob)) {
-+						rc_update_bit_0(&rc, prob);
++					prob += LZMA_IS_REP_G0 - LZMA_IS_REP;
++					if (!rc_is_bit_1(&rc, prob)) {
 +						prob = (p + LZMA_IS_REP_0_LONG
-+							+ (state << LZMA_NUM_POS_BITS_MAX) + pos_state);
-+						if (rc_is_bit_0(&rc, prob)) {
-+							rc_update_bit_0(&rc, prob);
++							+ (state << LZMA_NUM_POS_BITS_MAX)
++							+ pos_state);
++						if (!rc_is_bit_1(&rc, prob)) {
 +
 +							state = state < LZMA_NUM_LIT_STATES ? 9 : 11;
 +							pos = buffer_pos - rep0;
@@ -1926,25 +1904,16 @@
 +								writebb((char*)buffer, header.dict_size);
 +							}
 +							continue;
-+						} else {
-+							rc_update_bit_1(&rc, prob);
 +						}
 +					} else {
 +						uint32_t distance;
 +
-+						rc_update_bit_1(&rc, prob);
-+						prob = p + LZMA_IS_REP_G1 + state;
-+						if (rc_is_bit_0(&rc, prob)) {
-+							rc_update_bit_0(&rc, prob);
-+							distance = rep1;
-+						} else {
-+							rc_update_bit_1(&rc, prob);
-+							prob = p + LZMA_IS_REP_G2 + state;
-+							if (rc_is_bit_0(&rc, prob)) {
-+								rc_update_bit_0(&rc, prob);
-+								distance = rep2;
-+							} else {
-+								rc_update_bit_1(&rc, prob);
++						prob += LZMA_IS_REP_G1 - LZMA_IS_REP_G0;
++						distance = rep1;
++						if (rc_is_bit_1(&rc, prob)) {
++							prob += LZMA_IS_REP_G2 - LZMA_IS_REP_G1;
++							distance = rep2;
++							if (rc_is_bit_1(&rc, prob)) {
 +								distance = rep3;
 +								rep3 = rep2;
 +							}
@@ -1958,26 +1927,22 @@
 +				}
 +
 +				prob_len = prob + LZMA_LEN_CHOICE;
-+				if (rc_is_bit_0(&rc, prob_len)) {
-+					rc_update_bit_0(&rc, prob_len);
-+					prob_len = (prob + LZMA_LEN_LOW
-+						    + (pos_state << LZMA_LEN_NUM_LOW_BITS));
++				if (!rc_is_bit_1(&rc, prob_len)) {
++					prob_len += LZMA_LEN_LOW - LZMA_LEN_CHOICE
++						    + (pos_state << LZMA_LEN_NUM_LOW_BITS);
 +					offset = 0;
 +					num_bits = LZMA_LEN_NUM_LOW_BITS;
 +				} else {
-+					rc_update_bit_1(&rc, prob_len);
-+					prob_len = prob + LZMA_LEN_CHOICE_2;
-+					if (rc_is_bit_0(&rc, prob_len)) {
-+						rc_update_bit_0(&rc, prob_len);
-+						prob_len = (prob + LZMA_LEN_MID
-+							    + (pos_state << LZMA_LEN_NUM_MID_BITS));
++					prob_len += LZMA_LEN_CHOICE_2 - LZMA_LEN_CHOICE;
++					if (!rc_is_bit_1(&rc, prob_len)) {
++						prob_len += LZMA_LEN_MID - LZMA_LEN_CHOICE_2
++							    + (pos_state << LZMA_LEN_NUM_MID_BITS);
 +						offset = 1 << LZMA_LEN_NUM_LOW_BITS;
 +						num_bits = LZMA_LEN_NUM_MID_BITS;
 +					} else {
-+						rc_update_bit_1(&rc, prob_len);
-+						prob_len = prob + LZMA_LEN_HIGH;
++						prob_len += LZMA_LEN_HIGH - LZMA_LEN_CHOICE_2;
 +						offset = ((1 << LZMA_LEN_NUM_LOW_BITS)
-+							  + (1 << LZMA_LEN_NUM_MID_BITS));
++							+ (1 << LZMA_LEN_NUM_MID_BITS));
 +						num_bits = LZMA_LEN_NUM_HIGH_BITS;
 +					}
 +				}
@@ -1988,25 +1953,25 @@
 +					int pos_slot;
 +
 +					state += LZMA_NUM_LIT_STATES;
-+					prob =
-+					    p + LZMA_POS_SLOT +
++					prob = p + LZMA_POS_SLOT +
 +					    ((len <
 +					      LZMA_NUM_LEN_TO_POS_STATES ? len :
 +					      LZMA_NUM_LEN_TO_POS_STATES - 1)
 +					     << LZMA_NUM_POS_SLOT_BITS);
 +					rc_bit_tree_decode(&rc, prob, LZMA_NUM_POS_SLOT_BITS,
 +							   &pos_slot);
++					rep0 = pos_slot;
 +					if (pos_slot >= LZMA_START_POS_MODEL_INDEX) {
 +						num_bits = (pos_slot >> 1) - 1;
 +						rep0 = 2 | (pos_slot & 1);
++						prob = p + LZMA_ALIGN;
 +						if (pos_slot < LZMA_END_POS_MODEL_INDEX) {
 +							rep0 <<= num_bits;
-+							prob = p + LZMA_SPEC_POS + rep0 - pos_slot - 1;
++							prob += LZMA_SPEC_POS - LZMA_ALIGN - 1 + rep0 - pos_slot;
 +						} else {
 +							num_bits -= LZMA_NUM_ALIGN_BITS;
 +							while (num_bits--)
 +								rep0 = (rep0 << 1) | rc_direct_bit(&rc);
-+							prob = p + LZMA_ALIGN;
 +							rep0 <<= LZMA_NUM_ALIGN_BITS;
 +							num_bits = LZMA_NUM_ALIGN_BITS;
 +						}
@@ -2017,8 +1982,7 @@
 +							rep0 |= i;
 +							i <<= 1;
 +						}
-+					} else
-+						rep0 = pos_slot;
++					}
 +					if (++rep0 == 0)
 +						break;
 +				}
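
The speedup comes from refactoring the range coder's bit decoder: the old rc_is_bit_0()/rc_is_bit_0_helper() pair plus the separate rc_update_bit_0()/rc_update_bit_1() updaters are fused into one rc_is_bit_1() that compares code against the bound, adapts the probability in place, and returns the decoded bit. With that, `bound` becomes a local (kept in a register) instead of a field written to and re-read from rc_t between calls, and every call site shrinks from a branch over three helpers to a single call. The follow-on cleanups apply the same idea to the probability pointers (prob += LZMA_IS_REP_G0 - LZMA_IS_REP instead of recomputing from p) and to the literal-state adjustment. Below is a minimal standalone sketch of the fused-decoder pattern; rc_t and the function names mirror the patch, but it assumes nothing beyond what the diff shows: the struct is stripped to two fields, range normalization and input refill are omitted, and main() is a hypothetical driver added only so the sketch compiles and runs.

#include <stdint.h>
#include <stdio.h>

#define RC_MODEL_TOTAL_BITS 11
#define RC_MOVE_BITS 5

typedef struct {
	uint32_t code;
	uint32_t range;
	/* no `bound` member: after the patch it is a local below */
} rc_t;

/* One call now does what rc_is_bit_0() + rc_update_bit_0()/_1() did:
 * compare code against bound, adapt the probability, return the bit. */
static int rc_is_bit_1(rc_t *rc, uint16_t *p)
{
	uint32_t bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);

	if (rc->code < bound) {		/* decoded a 0: keep low part of range */
		rc->range = bound;
		*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
		return 0;
	}
	/* decoded a 1: keep high part of range */
	rc->code -= bound;
	rc->range -= bound;
	*p -= *p >> RC_MOVE_BITS;
	return 1;
}

/* The old if/else in rc_get_bit() collapses into plain arithmetic */
static int rc_get_bit(rc_t *rc, uint16_t *p, int *symbol)
{
	int ret = rc_is_bit_1(rc, p);
	*symbol = *symbol * 2 + ret;
	return ret;
}

int main(void)
{
	/* toy state only; a real decoder refills and renormalizes the range */
	rc_t rc = { .code = 0x90000000u, .range = 0xFFFFFFFFu };
	uint16_t prob = 1 << (RC_MODEL_TOTAL_BITS - 1);	/* p(0) = 0.5 */
	int symbol = 1;
	int i;

	for (i = 0; i < 4; i++)
		rc_get_bit(&rc, &prob, &symbol);
	printf("symbol=%d prob=%u\n", symbol, (unsigned)prob);
	return 0;
}

Built with a plain gcc invocation, this decodes four bits from a made-up code/range pair, exercising both branches and the probability adaptation without any of the kernel plumbing.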