wok rev 2796

linux: speedup unlzma (18% faster)
author Pascal Bellard <pascal.bellard@slitaz.org>
date Tue Apr 28 15:47:17 2009 +0200 (2009-04-28)
parents 1ac0ba1a2795
children 08e60b697edb
files linux/stuff/linux-lzma-2.6.25.5.u
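
In short, the LZMA algorithm itself is untouched; the speedup comes from restructuring the range coder in the embedded decompress_unlzma.c patch. The cached bound field is dropped from rc_t, the rc_is_bit_0_helper()/rc_update_bit_0()/rc_update_bit_1() trio is folded into a single rc_is_bit_1() that updates the coder state and returns the decoded bit, rc_read(), rc_do_normalize() and rc_get_bit() gain size_inline/speed_inline hints, and the hot loop uses small relative offsets (e.g. prob += LZMA_IS_REP_G1 - LZMA_IS_REP_G0) instead of recomputing pointers from p. A condensed, stand-alone sketch of the merged bit decoder follows; the RC_* values and the rc_normalize() stub are filled in here only so the fragment compiles on its own, the real definitions live in lib/decompress_unlzma.c:

	/* Sketch of the merged bit decode introduced by this changeset,
	 * with the double diff markers stripped and minimal context added. */
	#include <stdint.h>

	#define RC_MODEL_TOTAL_BITS	11	/* usual LZMA values */
	#define RC_MOVE_BITS		5

	typedef struct {
		uint32_t code;
		uint32_t range;		/* no cached `bound' field any more */
	} rc_t;

	static void rc_normalize(rc_t *rc)
	{
		/* stub: the kernel version refills `code' and rescales
		 * `range' from the input when `range' gets too small */
		(void)rc;
	}

	/* Replaces rc_is_bit_0_helper()/rc_update_bit_0()/rc_update_bit_1():
	 * test the probability, update the coder state and the model, and
	 * return the decoded bit in one step. */
	static int rc_is_bit_1(rc_t *rc, uint16_t *p)
	{
		uint32_t bound;

		rc_normalize(rc);
		bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
		if (rc->code < bound) {		/* decoded a 0 bit */
			rc->range = bound;
			*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
			return 0;
		}
		rc->code -= bound;		/* decoded a 1 bit */
		rc->range -= bound;
		*p -= *p >> RC_MOVE_BITS;
		return 1;
	}

Each call site in the line diff below then collapses from the old rc_is_bit_0()/rc_update_bit_0()/rc_update_bit_1() sequence into a single if (rc_is_bit_1(&rc, prob)) or if (!rc_is_bit_1(&rc, prob)) test.
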
line diff
     1.1 --- a/linux/stuff/linux-lzma-2.6.25.5.u	Tue Apr 28 15:38:14 2009 +0200
     1.2 +++ b/linux/stuff/linux-lzma-2.6.25.5.u	Tue Apr 28 15:47:17 2009 +0200
     1.3 @@ -1444,7 +1444,7 @@
     1.4  
     1.5  --- linux-2.6.25.5/lib/decompress_unlzma.c
     1.6  +++ linux-2.6.25.5/lib/decompress_unlzma.c
     1.7 -@@ -0,0 +1,616 @@
     1.8 +@@ -0,0 +1,580 @@
     1.9  +/* Lzma decompressor for Linux kernel. Shamelessly snarfed
    1.10  + * from busybox 1.1.1
    1.11  + *
    1.12 @@ -1546,8 +1546,10 @@
    1.13  +
    1.14  +#ifdef CONFIG_FEATURE_LZMA_FAST
    1.15  +#  define speed_inline always_inline
    1.16 ++#  define size_inline
    1.17  +#else
    1.18  +#  define speed_inline
    1.19 ++#  define size_inline always_inline
    1.20  +#endif
    1.21  +
    1.22  +
    1.23 @@ -1559,7 +1561,6 @@
    1.24  +	int buffer_size;
    1.25  +	uint32_t code;
    1.26  +	uint32_t range;
    1.27 -+	uint32_t bound;
    1.28  +} rc_t;
    1.29  +
    1.30  +
    1.31 @@ -1569,7 +1570,7 @@
    1.32  +
    1.33  +
    1.34  +/* Called twice: once at startup and once in rc_normalize() */
    1.35 -+static void rc_read(rc_t * rc)
    1.36 ++static size_inline void rc_read(rc_t * rc)
    1.37  +{
    1.38  +	if (!rc->buffer_size) return;
    1.39  +	if (rc->fill) {
    1.40 @@ -1607,8 +1608,8 @@
    1.41  +	}
    1.42  +}
    1.43  +
    1.44 -+/* Called twice, but one callsite is in speed_inline'd rc_is_bit_0_helper() */
    1.45 -+static void rc_do_normalize(rc_t * rc)
    1.46 ++/* Called twice, but one callsite is in speed_inline'd rc_is_bit_1() */
    1.47 ++static speed_inline void rc_do_normalize(rc_t * rc)
    1.48  +{
    1.49  +	if (rc->ptr >= rc->buffer_end)
    1.50  +		rc_read(rc);
    1.51 @@ -1623,46 +1624,30 @@
    1.52  +}
    1.53  +
    1.54  +/* Called 9 times */
    1.55 -+/* Why rc_is_bit_0_helper exists?
    1.56 -+ * Because we want to always expose (rc->code < rc->bound) to optimizer
    1.57 -+ */
    1.58 -+static speed_inline uint32_t rc_is_bit_0_helper(rc_t * rc, uint16_t * p)
    1.59 ++static speed_inline int rc_is_bit_1(rc_t * rc, uint16_t * p)
    1.60  +{
    1.61 ++	uint32_t bound;
    1.62  +	rc_normalize(rc);
    1.63 -+	rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
    1.64 -+	return rc->bound;
    1.65 -+}
    1.66 -+static always_inline int rc_is_bit_0(rc_t * rc, uint16_t * p)
    1.67 -+{
    1.68 -+	uint32_t t = rc_is_bit_0_helper(rc, p);
    1.69 -+	return rc->code < t;
    1.70 -+}
    1.71 -+
    1.72 -+/* Called ~10 times, but very small, thus inlined */
    1.73 -+static speed_inline void rc_update_bit_0(rc_t * rc, uint16_t * p)
    1.74 -+{
    1.75 -+	rc->range = rc->bound;
    1.76 -+	*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
    1.77 -+}
    1.78 -+static speed_inline void rc_update_bit_1(rc_t * rc, uint16_t * p)
    1.79 -+{
    1.80 -+	rc->range -= rc->bound;
    1.81 -+	rc->code -= rc->bound;
    1.82 -+	*p -= *p >> RC_MOVE_BITS;
    1.83 ++	bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
    1.84 ++	if (rc->code < bound) {
    1.85 ++		rc->range = bound;
    1.86 ++		*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
    1.87 ++		return 0;
    1.88 ++	}
    1.89 ++	else {
    1.90 ++		rc->code -= bound;
    1.91 ++		rc->range -= bound;
    1.92 ++		*p -= *p >> RC_MOVE_BITS;
    1.93 ++		return 1;
    1.94 ++	}
    1.95  +}
    1.96  +
    1.97  +/* Called 4 times in unlzma loop */
    1.98 -+static int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol)
    1.99 ++static speed_inline int rc_get_bit(rc_t * rc, uint16_t * p, int *symbol)
   1.100  +{
   1.101 -+	if (rc_is_bit_0(rc, p)) {
   1.102 -+		rc_update_bit_0(rc, p);
   1.103 -+		*symbol *= 2;
   1.104 -+		return 0;
   1.105 -+	} else {
   1.106 -+		rc_update_bit_1(rc, p);
   1.107 -+		*symbol = *symbol * 2 + 1;
   1.108 -+		return 1;
   1.109 -+	}
   1.110 ++	int ret  = rc_is_bit_1(rc, p);
   1.111 ++	*symbol = *symbol * 2 + ret;
   1.112 ++	return ret;
   1.113  +}
   1.114  +
   1.115  +/* Called once */
   1.116 @@ -1838,9 +1823,8 @@
   1.117  +
   1.118  +		prob =
   1.119  +			p + LZMA_IS_MATCH + (state << LZMA_NUM_POS_BITS_MAX) + pos_state;
   1.120 -+		if (rc_is_bit_0(&rc, prob)) {
   1.121 ++		if (!rc_is_bit_1(&rc, prob)) {
   1.122  +			mi = 1;
   1.123 -+			rc_update_bit_0(&rc, prob);
   1.124  +			prob = (p + LZMA_LITERAL + (LZMA_LIT_SIZE
   1.125  +					* ((((buffer_pos + global_pos) & literal_pos_mask) << lc)
   1.126  +					+ (previous_byte >> (8 - lc)))));
   1.127 @@ -1863,49 +1847,43 @@
   1.128  +					match_byte <<= 1;
   1.129  +					bit = match_byte & 0x100;
   1.130  +					prob_lit = prob + 0x100 + bit + mi;
   1.131 -+					if (rc_get_bit(&rc, prob_lit, &mi)) {
   1.132 -+						if (!bit)
   1.133 -+							break;
   1.134 -+					} else {
   1.135 -+						if (bit)
   1.136 -+							break;
   1.137 -+					}
   1.138 ++					bit ^= (rc_get_bit(&rc, prob_lit, &mi) << 8);
   1.139 ++					if (bit)
   1.140 ++						break;
   1.141  +				} while (mi < 0x100);
   1.142  +			}
   1.143  +			while (mi < 0x100) {
   1.144  +				prob_lit = prob + mi;
   1.145  +				rc_get_bit(&rc, prob_lit, &mi);
   1.146  +			}
   1.147 ++			state -= 3;
   1.148 ++			if (state < 4 - 3)
   1.149 ++				state = 0;
   1.150 ++			if (state >= 10-3)
   1.151 ++				state -= 6-3;
   1.152  +			previous_byte = (uint8_t) mi;
   1.153 -+			if (state < 4)
   1.154 -+				state = 0;
   1.155 -+			else if (state < 10)
   1.156 -+				state -= 3;
   1.157 -+			else
   1.158 -+				state -= 6;
   1.159 ++#ifdef xxCONFIG_FEATURE_LZMA_FAST
   1.160 ++#else
   1.161  +			goto store_previous_byte;
   1.162 ++#endif
   1.163  +		} else {
   1.164  +			int offset;
   1.165  +			uint16_t *prob_len;
   1.166  +
   1.167 -+			rc_update_bit_1(&rc, prob);
   1.168  +			prob = p + LZMA_IS_REP + state;
   1.169 -+			if (rc_is_bit_0(&rc, prob)) {
   1.170 -+				rc_update_bit_0(&rc, prob);
   1.171 ++			if (!rc_is_bit_1(&rc, prob)) {
   1.172  +				rep3 = rep2;
   1.173  +				rep2 = rep1;
   1.174  +				rep1 = rep0;
   1.175  +				state = state < LZMA_NUM_LIT_STATES ? 0 : 3;
   1.176  +				prob = p + LZMA_LEN_CODER;
   1.177  +			} else {
   1.178 -+				rc_update_bit_1(&rc, prob);
   1.179 -+				prob = p + LZMA_IS_REP_G0 + state;
   1.180 -+				if (rc_is_bit_0(&rc, prob)) {
   1.181 -+					rc_update_bit_0(&rc, prob);
   1.182 ++				prob += LZMA_IS_REP_G0 - LZMA_IS_REP;
   1.183 ++				if (!rc_is_bit_1(&rc, prob)) {
   1.184  +					prob = (p + LZMA_IS_REP_0_LONG
   1.185 -+							+ (state << LZMA_NUM_POS_BITS_MAX) + pos_state);
   1.186 -+					if (rc_is_bit_0(&rc, prob)) {
   1.187 -+						rc_update_bit_0(&rc, prob);
   1.188 ++						+ (state << LZMA_NUM_POS_BITS_MAX)
   1.189 ++						+ pos_state);
   1.190 ++					if (!rc_is_bit_1(&rc, prob)) {
   1.191  +
   1.192  +						state = state < LZMA_NUM_LIT_STATES ? 9 : 11;
   1.193  +						pos = buffer_pos - rep0;
   1.194 @@ -1926,25 +1904,16 @@
   1.195  +							writebb((char*)buffer, header.dict_size);
   1.196  +						}
   1.197  +						continue;
   1.198 -+					} else {
   1.199 -+						rc_update_bit_1(&rc, prob);
   1.200  +					}
   1.201  +				} else {
   1.202  +					uint32_t distance;
   1.203  +
   1.204 -+					rc_update_bit_1(&rc, prob);
   1.205 -+					prob = p + LZMA_IS_REP_G1 + state;
   1.206 -+					if (rc_is_bit_0(&rc, prob)) {
   1.207 -+						rc_update_bit_0(&rc, prob);
   1.208 -+						distance = rep1;
   1.209 -+					} else {
   1.210 -+						rc_update_bit_1(&rc, prob);
   1.211 -+						prob = p + LZMA_IS_REP_G2 + state;
   1.212 -+						if (rc_is_bit_0(&rc, prob)) {
   1.213 -+							rc_update_bit_0(&rc, prob);
   1.214 -+							distance = rep2;
   1.215 -+						} else {
   1.216 -+							rc_update_bit_1(&rc, prob);
   1.217 ++					prob += LZMA_IS_REP_G1 - LZMA_IS_REP_G0;
   1.218 ++					distance = rep1;
   1.219 ++					if (rc_is_bit_1(&rc, prob)) {
   1.220 ++						prob += LZMA_IS_REP_G2 - LZMA_IS_REP_G1;
   1.221 ++						distance = rep2;
   1.222 ++						if (rc_is_bit_1(&rc, prob)) {
   1.223  +							distance = rep3;
   1.224  +							rep3 = rep2;
   1.225  +						}
   1.226 @@ -1958,26 +1927,22 @@
   1.227  +			}
   1.228  +
   1.229  +			prob_len = prob + LZMA_LEN_CHOICE;
   1.230 -+			if (rc_is_bit_0(&rc, prob_len)) {
   1.231 -+				rc_update_bit_0(&rc, prob_len);
   1.232 -+				prob_len = (prob + LZMA_LEN_LOW
   1.233 -+							+ (pos_state << LZMA_LEN_NUM_LOW_BITS));
   1.234 ++			if (!rc_is_bit_1(&rc, prob_len)) {
   1.235 ++				prob_len += LZMA_LEN_LOW - LZMA_LEN_CHOICE
   1.236 ++					    + (pos_state << LZMA_LEN_NUM_LOW_BITS);
   1.237  +				offset = 0;
   1.238  +				num_bits = LZMA_LEN_NUM_LOW_BITS;
   1.239  +			} else {
   1.240 -+				rc_update_bit_1(&rc, prob_len);
   1.241 -+				prob_len = prob + LZMA_LEN_CHOICE_2;
   1.242 -+				if (rc_is_bit_0(&rc, prob_len)) {
   1.243 -+					rc_update_bit_0(&rc, prob_len);
   1.244 -+					prob_len = (prob + LZMA_LEN_MID
   1.245 -+								+ (pos_state << LZMA_LEN_NUM_MID_BITS));
   1.246 ++				prob_len += LZMA_LEN_CHOICE_2 - LZMA_LEN_CHOICE;
   1.247 ++				if (!rc_is_bit_1(&rc, prob_len)) {
   1.248 ++					prob_len += LZMA_LEN_MID - LZMA_LEN_CHOICE_2
   1.249 ++						    + (pos_state << LZMA_LEN_NUM_MID_BITS);
   1.250  +					offset = 1 << LZMA_LEN_NUM_LOW_BITS;
   1.251  +					num_bits = LZMA_LEN_NUM_MID_BITS;
   1.252  +				} else {
   1.253 -+					rc_update_bit_1(&rc, prob_len);
   1.254 -+					prob_len = prob + LZMA_LEN_HIGH;
   1.255 ++					prob_len += LZMA_LEN_HIGH - LZMA_LEN_CHOICE_2;
   1.256  +					offset = ((1 << LZMA_LEN_NUM_LOW_BITS)
   1.257 -+							  + (1 << LZMA_LEN_NUM_MID_BITS));
   1.258 ++						 + (1 << LZMA_LEN_NUM_MID_BITS));
   1.259  +					num_bits = LZMA_LEN_NUM_HIGH_BITS;
   1.260  +				}
   1.261  +			}
   1.262 @@ -1988,25 +1953,25 @@
   1.263  +				int pos_slot;
   1.264  +
   1.265  +				state += LZMA_NUM_LIT_STATES;
   1.266 -+				prob =
   1.267 -+					p + LZMA_POS_SLOT +
   1.268 ++				prob = p + LZMA_POS_SLOT +
   1.269  +					((len <
   1.270  +					  LZMA_NUM_LEN_TO_POS_STATES ? len :
   1.271  +					  LZMA_NUM_LEN_TO_POS_STATES - 1)
   1.272  +					 << LZMA_NUM_POS_SLOT_BITS);
   1.273  +				rc_bit_tree_decode(&rc, prob, LZMA_NUM_POS_SLOT_BITS,
   1.274  +								   &pos_slot);
   1.275 ++				rep0 = pos_slot;
   1.276  +				if (pos_slot >= LZMA_START_POS_MODEL_INDEX) {
   1.277  +					num_bits = (pos_slot >> 1) - 1;
   1.278  +					rep0 = 2 | (pos_slot & 1);
   1.279 ++					prob = p + LZMA_ALIGN;
   1.280  +					if (pos_slot < LZMA_END_POS_MODEL_INDEX) {
   1.281  +						rep0 <<= num_bits;
   1.282 -+						prob = p + LZMA_SPEC_POS + rep0 - pos_slot - 1;
   1.283 ++						prob += LZMA_SPEC_POS - LZMA_ALIGN - 1 + rep0 - pos_slot;
   1.284  +					} else {
   1.285  +						num_bits -= LZMA_NUM_ALIGN_BITS;
   1.286  +						while (num_bits--)
   1.287  +							rep0 = (rep0 << 1) | rc_direct_bit(&rc);
   1.288 -+						prob = p + LZMA_ALIGN;
   1.289  +						rep0 <<= LZMA_NUM_ALIGN_BITS;
   1.290  +						num_bits = LZMA_NUM_ALIGN_BITS;
   1.291  +					}
   1.292 @@ -2017,8 +1982,7 @@
   1.293  +							rep0 |= i;
   1.294  +						i <<= 1;
   1.295  +					}
   1.296 -+				} else
   1.297 -+					rep0 = pos_slot;
   1.298 ++				}
   1.299  +				if (++rep0 == 0)
   1.300  +					break;
   1.301  +			}