wok-tiny diff linux/stuff/unlzsa1.S @ rev 179

linux: rewrite stuff/pack
author Pascal Bellard <pascal.bellard@slitaz.org>
date Tue Aug 15 09:20:55 2023 +0000 (9 months ago)
parents 2c80994c5e30
children
line diff
     1.1 --- a/linux/stuff/unlzsa1.S	Wed Jul 14 14:20:00 2021 +0000
     1.2 +++ b/linux/stuff/unlzsa1.S	Tue Aug 15 09:20:55 2023 +0000
     1.3 @@ -1,143 +1,211 @@
     1.4 -//  based on
     1.5 -//  decompress_small.S - space-efficient decompressor implementation for 8088
     1.6 -//
     1.7 -//  Copyright (C) 2019 Emmanuel Marty
     1.8 -//
     1.9 -//  This software is provided 'as-is', without any express or implied
    1.10 -//  warranty.  In no event will the authors be held liable for any damages
    1.11 -//  arising from the use of this software.
    1.12 -//
    1.13 -//  Permission is granted to anyone to use this software for any purpose,
    1.14 -//  including commercial applications, and to alter it and redistribute it
    1.15 -//  freely, subject to the following restrictions:
    1.16 -//
    1.17 -//  1. The origin of this software must not be misrepresented; you must not
    1.18 -//     claim that you wrote the original software. If you use this software
    1.19 -//     in a product, an acknowledgment in the product documentation would be
    1.20 -//     appreciated but is not required.
    1.21 -//  2. Altered source versions must be plainly marked as such, and must not be
    1.22 -//     misrepresented as being the original software.
    1.23 -//  3. This notice may not be removed or altered from any source distribution.
    1.24 -
    1.25 -//  ---------------------------------------------------------------------------
    1.26 -//  Decompress raw LZSA1 block
    1.27 -//  inputs:
    1.28 -//  * %ds:%si: raw LZSA1 block
    1.29 -//  * %es:%di: output buffer
    1.30 -//  ---------------------------------------------------------------------------
    1.31 -
    1.32 +// Lzsa1Decode: LZSA1 decompressor, entered at lzsa1main; AX/BX/SI/DI below alias the 16- or 32-bit registers
    1.33 +#ifndef FLAT32
    1.34 +//   input   ds:si=inStream, es:di=outStream
    1.35 +//   output  outStream[], ds:si, es:di
    1.36  	.code16			// default build (FLAT32 undefined): assemble 16-bit code
    1.37 -lzsa1_decompress:
    1.38 -	//pushw	%di		// remember decompression offset
    1.39 -	//cld			// make string operations (lods, movs, stos..) move forward
    1.40 -
    1.41 -lzsa1_decode_token:
    1.42 -	xorb	%ah, %ah	// clear %ah
    1.43 -	lodsb			// read token byte: O|LLL|MMMM
    1.44 -	movw	%ax,%bx		// keep token in %bl
    1.45 -	
    1.46 -	andb	$0x70, %al	// isolate literals length in token (LLL)
    1.47 -	je	lzsa1_check_offset_size	// if LLL=0, we have no literals; goto match
    1.48 -#ifdef ONLY8086
    1.49 -	movb	$4, %cl
    1.50 -	shrb	%cl, %al	// shift literals length into place
    1.51 +#define AX	%ax
    1.52 +#define BX	%bx
    1.53 +#define SI	%si
    1.54 +#define DI	%di
    1.55  #else
    1.56 -	shrb	$4, %al		// shift literals length into place
    1.57 +//   input   esi=inStream, edi=outStream  (FLAT32 build)
    1.58 +//   output  outStream[], ds:esi, es:edi
    1.59 +	.code32			// FLAT32 build: assemble 32-bit code
    1.60 +#define AX	%eax
    1.61 +#define BX	%ebx
    1.62 +#define SI	%esi
    1.63 +#define DI	%edi
    1.64  #endif
    1.65  
    1.66 -	cmpb	$7, %al		// LITERALS_RUN_LEN?
    1.67 -	jne	lzsa1_got_literals	// no, we have the full literals count from the token, go copy
    1.68 +MATCH_RUN_LEN		=	15	// mask of the token's MMMM field; a field equal to it makes lzsa1len read extra length bytes
    1.69 +LITERALS_RUN_LEN	=	7	// mask of the token's LLL field (after the shift by 4); same escape rule
    1.70 +MIN_MATCH_SIZE		=	3	// bias lzsa1len adds to the match length field
    1.71 +MIN_LITERALS_SIZE	=	0	// bias lzsa1len adds to the literals length field
    1.72  
    1.73 -	lodsb                   // grab extra length byte
    1.74 -	addb	$7, %al		// add LITERALS_RUN_LEN
    1.75 -	jnc	lzsa1_got_literals	// if no overflow, we have the full literals count, go copy
    1.76 -	je	lzsa1_big_literals
    1.77 +#define PACKED_ONLY			// assume no copy block, optional
    1.78 +//#define PARANOIA			// cover rare cases, optional
    1.79 + 
    1.80 +.macro	shrclw cnt,obj		// 16-bit logical shift right of \obj by the immediate \cnt
    1.81 +#ifdef ONLY8086
    1.82 +	movb	\cnt, %cl	// ONLY8086 build: the count goes through %cl, so %cl is clobbered
    1.83 +	shrw	%cl, \obj
    1.84 +#else
    1.85 +	shrw	\cnt, \obj	// other builds: shift by the immediate count, %cl untouched
    1.86 +#endif
    1.87 +.endm
    1.88  
    1.89 -	movb	$1, %ah		// add 256 (I'd prefer 'xchgb %al, %ah'	max 1791 instead of 511)
    1.90 -	lodsb			// grab single extra length byte
    1.91 -	.byte	0x3C		// mask lodsw with cmpb $0xAD, %al
    1.92 -				// (*like jmp short lzsa1_got_literals but faster)
    1.93 -
    1.94 -lzsa1_big_literals:
    1.95 -	lodsw			// grab 16-bit extra length
    1.96 -
    1.97 -lzsa1_got_literals:
    1.98 +#ifdef FLAT16OUT
    1.99 +#define RAW_FORMAT
   1.100 +#endif
   1.101 +lzsa1main:			// entry point; FLAT16OUT builds imply RAW_FORMAT and skip all header parsing below
   1.102 +#ifdef PARANOIA
   1.103 +	cld			// make the string instructions move forward
   1.104 +#endif
   1.105 +#ifndef RAW_FORMAT
   1.106 +# ifndef NO_LZSA1_HEADER
   1.107 +	lodsw			// read a 16-bit word of the stream header
   1.108 +	cmpw	$0x9E7B, %ax	// magic
   1.109 +	jne	lzsa1main	// no match: try again with the following word
   1.110 +	lodsb			// consume the byte after the magic (its check is commented out)
   1.111 +//	cmpb	$0, %al		// lzsa1
   1.112 +//	jne	lzsa1main
   1.113 +# endif
   1.114 +	xorw	%ax, %ax	// %ax = 0
   1.115 +	xchgw	%ax, %di	// %ax = caller's %di, %di = 0
   1.116 +	shrclw	$4, %ax		// %ax = caller's %di in 16-byte paragraphs
   1.117 +	jmp	lzsa1blockz	// fold it into %es there; %di *MUST* be paragraph aligned
   1.118 +# ifndef PACKED_ONLY
   1.119 +lzsa1copy:			// reached when bit 7 of the third block-header byte is set
   1.120 +	movsb			// handle 64K case: copy one byte first so that %cx = 0 copies 1 + 65535 bytes
   1.121 +	decw	%cx
   1.122 +	rep	movsb		// copy block
   1.123 +# endif
   1.124 +lzsa1block:			// start of the next block: step the output segment by 0x1000 paragraphs
   1.125 +	movw	$0x1000, %ax
   1.126 +lzsa1blockz:			// %es += %ax
   1.127 +	movw	%es, %bx
   1.128 +	addw	%ax, %bx
   1.129 +	movw	%bx, %es
   1.130 +# ifndef FLAT16
   1.131 +	movw	%si, %ax	// normalize %ds:%si: keep the low 4 bits in %si ...
   1.132 +	andw	$0xf, %si
   1.133 +	shrclw	$4, %ax
   1.134 +	movw	%ds, %bx
   1.135 +	addw	%ax, %bx	// ... and add the remaining paragraphs to %ds
   1.136 +	movw	%bx, %ds
   1.137 +# endif
   1.138 +	lodsw			// block size
   1.139  	xchgw	%ax, %cx	// %cx = block size
   1.140 -#ifdef USE_MOVSW
   1.141 -	shrw	$1, %cx
   1.142 -	rep movsw
   1.143 -	adcw	%cx, %cx
   1.144 +	movw	%cx, %dx	// %dx = block size, turned into the end pointer at lzsa1full
   1.145 +	lodsb			// third header byte (only examined when PACKED_ONLY is undefined)
   1.146 +# ifndef PACKED_ONLY
   1.147 +	orb	%al, %al	// set SF/ZF from that byte
   1.148 +	js	lzsa1copy	// bit 7 set: the block is copied as is
   1.149 +	jne	lzsa1full	// 64Kb block
   1.150 +# endif
   1.151 +	jcxz	lzsa1quit	// block size 0: end of data, return
   1.152 +lzsa1full:
   1.153 +	addw	%si, %dx	// %dx = value of %si at the end of this block's data
   1.154  #endif
   1.155 -	rep movsb		// copy %cx literals from %ds:%si to %es:%di
   1.156 -
   1.157 -lzsa1_check_offset_size:
   1.158 +lzsa1chunk:			// decode one token: literal run first, then the match offset
   1.159 +	lodsb			// get token O|LLL|MMMM
   1.160 +	movb	%al, %bl	// keep token in bl
   1.161 +	shrclw	$4, %ax		// shift literals length into place
   1.162 +	movw	$LITERALS_RUN_LEN+256*MIN_LITERALS_SIZE, %cx	// %cl = LLL mask, %ch = minimum literals count
   1.163 +	call	lzsa1len	// returns the literals count in %cx
   1.164 +	rep	movsb		// copy %cx literals from %ds:%si to %es:%di
   1.165 +#ifndef RAW_FORMAT
   1.166 +	cmpw	%dx, %si
   1.167 +	je	lzsa1block	// input reached the end of this block: go read the next block header
   1.168 +#endif
   1.169 +#ifdef FLAT32
   1.170 +	orl	$-1, %eax	// preset every offset bit to 1 before the low byte(s) are loaded
   1.171 +#endif
   1.172  	testb	%bl, %bl	// check match offset size in token (O bit)
   1.173 -	js	lzsa1_get_long_offset
   1.174 -
   1.175 -	decw	%cx
   1.176 -	xchgw	%ax, %cx	// %ah to 0xff - %cx was zero from the rep movsb above
   1.177 +	js	lzsa1LongOfs	// O bit set: 2-byte offset
   1.178 +#ifndef FLAT32
   1.179 +	movb	$-1, %ah	// set offset bits 15-8 to 1
   1.180 +#endif
   1.181  	lodsb			// O bit clear: 1-byte offset into %al
   1.182  	.byte	0x3C		// mask lodsw with cmpb $0xAD, %al. NOTE(review): in the FLAT32 (.code32) build lodsw presumably carries a 0x66 prefix, so only the prefix would be masked - confirm
   1.183 -				// (*like jmp short lzsa1_get_match_length but faster)
   1.184 -
   1.185 -lzsa1_get_long_offset:
   1.186 -	lodsw			// Get 2-byte match offset
   1.187 -
   1.188 -lzsa1_get_match_length:
   1.189 -	xchgw	%ax, %bx	// %bx: match offset  %ax: original token
   1.190 -	andb	$0xF, %al	// isolate match length in token (MMMM)
   1.191 -	addb	$3, %al		// add MIN_MATCH_SIZE
   1.192 -
   1.193 -	cmpb	$0x12, %al	// MATCH_RUN_LEN?
   1.194 -	jne	lzsa1_got_matchlen	// no, we have the full match length from the token, go copy
   1.195 -
   1.196 -	lodsb			// grab extra length byte
   1.197 -	addb	$0x12, %al	// add MIN_MATCH_SIZE + MATCH_RUN_LEN
   1.198 -	jnc	lzsa1_got_matchlen	// if no overflow, we have the entire length
   1.199 -	je	lzsa1_big_matchlen       
   1.200 -
   1.201 -	movb	$1, %ah		// add 256 (I'd prefer 'xchgb %al, %ah'	max 3071 instead of 511)
   1.202 -	lodsb			// grab single extra length byte
   1.203 -	.byte	0x3C		// mask lodsw with cmpb $0xAD, %al
   1.204 -				// (*like jmp short lzsa1_got_matchlen but faster)
   1.205 -lzsa1_big_matchlen:
   1.206 -	lodsw			// grab 16-bit length
   1.207 -
   1.208 -lzsa1_got_matchlen:
   1.209 -	xchgw	%ax, %cx	// copy match length into %cx
   1.210 -	jcxz	lzsa1_done_decompressing	// bail if we hit EOD
   1.211 -	xchgw	%ax, %si	// save %si (current pointer to compressed data)
   1.212 -	leaw	(%bx,%di), %si	// %es:%si now points at back reference in output data
   1.213 -#ifdef USE_MOVSW
   1.214 -	cmpw	$-2, %bx
   1.215 -	jae	lzsa1_store
   1.216 -	shrw	$1, %cx
   1.217 -	rep movsw %es:(%si), %es:(%di)
   1.218 -	adcw	%cx, %cx
   1.219 -#endif
   1.220 -	rep movsb %es:(%si), %es:(%di)		// copy match
   1.221 -	xchgw	%ax, %si	// restore %ds:%si
   1.222 -	jmp	lzsa1_decode_token	// go decode another token
   1.223 -#ifdef USE_MOVSW
   1.224 -lzsa1_store:
   1.225 -	je	lzsa1_store_word
   1.226 -	lodsb	%es:(%si)
   1.227 -	movb	%al, %ah	
   1.228 -	.byte	0x3D		// mask lodsw with cmpb $0x26AD, %ax
   1.229 -				// (*like jmp short lzsa1_store_byte but faster)
   1.230 -lzsa1_store_word:
   1.231 -	lodsw	%es:(%si)
   1.232 -lzsa1_store_byte:
   1.233 -	shrw	$1, %cx
   1.234 -	rep	stosw
   1.235 -	adcw	%cx, %cx
   1.236 -	rep	stosb
   1.237 -	xchgw	%ax, %si	// restore %ds:%si
   1.238 -	jmp	lzsa1_decode_token	// go decode another token
   1.239 +lzsa1LongOfs:			// target of the O-bit test; the short-offset path masks the lodsw below
   1.240 +	lodsw			// 2-byte match offset
   1.241 +	xchg	AX, BX		// %bx: match offset  %ax: original token
   1.242 +	movw	$MATCH_RUN_LEN+256*MIN_MATCH_SIZE, %cx	// %cl = MMMM mask, %ch = minimum match length
   1.243 +	call	lzsa1len	// returns the match length in %cx
   1.244 +#ifdef RAW_FORMAT
   1.245 +	jcxz	lzsa1quit	// bail if we hit EOD (match length of 0)
   1.246  #endif
   1.247  
   1.248 -lzsa1_done_decompressing:
   1.249 -//	popw	%ax		// retrieve the original decompression offset
   1.250 -//	xchgw	%ax, %di	// compute decompressed size
   1.251 -//	subw	%di, %ax
   1.252 -	ret			// done
   1.253 +#if !defined(FLAT16OUT) && !defined(FLAT32)
   1.254 +	xchg	AX, SI		// save %si (input pointer) in %ax
   1.255 +	lea	(BX,DI), SI	// %si = %di + match offset: start of the back reference
   1.256 +	pushw	%ds		// save the input segment
   1.257 +	movw	%es, %bp	// default source segment = output segment
   1.258 +	cmpw	%si, %di
   1.259 +	jnc	lzsa1sameSeg	// %di >= %si: the back reference did not wrap below offset 0
   1.260 +	pushw	%si
   1.261 +# ifdef ONLY8086
   1.262 +	pushw	%cx		// shrclw clobbers %cl in the ONLY8086 build
   1.263 +# endif
   1.264 +	shrclw	$4, %si		// wrapped offset in paragraphs
   1.265 +# ifdef ONLY8086
   1.266 +	popw	%cx
   1.267 +# endif
   1.268 +	lea	-4096(%bp,%si), %bp	// %bp = %es + (%si >> 4) - 0x1000
   1.269 +	popw	%si
   1.270 +	andw	$0xF, %si	// keep only the offset inside the paragraph
   1.271 +lzsa1sameSeg:
   1.272 +	movw	%bp, %ds	// %ds:%si = match source
   1.273 +# ifdef FASTFILL
   1.274 +	cmp	$-FASTFILL,BX
   1.275 +	jae	lzsa1fast	// offset in [-FASTFILL, -1]: use the fill path
   1.276 +# endif
   1.277 +	rep movsb		// copy %cx match bytes to %es:%di
   1.278 +lzsa1chunkz:
   1.279 +	popw	%ds		// restore the input segment
   1.280 +#else
   1.281 +# ifdef FASTFILL
   1.282 +	cmp	$-FASTFILL,BX
   1.283 +	jae	lzsa1fast	// offset in [-FASTFILL, -1]: use the fill path (taken before %si is saved)
   1.284 +# endif
   1.285 +	xchg	AX, SI		// save %si (input pointer) in %ax
   1.286 +	lea	(BX,DI), SI	// SI = DI + match offset: start of the back reference
   1.287 +# ifdef ONLY8086
   1.288 +lzsa2movsb:
   1.289 +	movsb	%es:(SI), %es:(DI)	//  NMOS 8088/8086 workaround.
   1.290 +	loop	lzsa2movsb	// one byte per iteration instead of rep with a segment override
   1.291 +# else
   1.292 +	rep movsb	%es:(SI), %es:(DI)	// copy the match inside the output segment
   1.293 +# endif
   1.294 +#define lzsa1chunkz lzsa1chunk
   1.295 +#endif
   1.296 +	xchg	AX, SI		// restore %si (input pointer)
   1.297 +	jmp	lzsa1chunk	// decode the next token
   1.298 +#ifdef FASTFILL
   1.299 +lzsa1fast:			// fill path for match offsets in [-FASTFILL, -1]; in: %cx = length, flags from the cmp against -FASTFILL
   1.300 +# if FASTFILL == 1
   1.301 +#  if !defined(FLAT16OUT) && !defined(FLAT32)
   1.302 +	lodsb			// %al = byte at the match source. NOTE(review): %ax holds the input pointer saved by xchg AX, SI on this path and is overwritten here - confirm
   1.303 +#  else
   1.304 +	movb	%es:(BX,DI), %al	// %al = byte at the match source
   1.305 +#  endif
   1.306 +	rep stosb		// replicate it %cx times
   1.307 +# endif
   1.308 +# if FASTFILL == 2
   1.309 +#  if !defined(FLAT16OUT) && !defined(FLAT32)
   1.310 +	lodsw			// %ax = word at the match source. NOTE(review): %ax holds the input pointer saved by xchg AX, SI on this path and is overwritten here - confirm
   1.311 +#  else
   1.312 +	movw	%es:(BX,DI), %ax	// %ax = word at the match source
   1.313 +#  endif
   1.314 +	je	lzsa1fastword	// flags are still those of the cmp: offset == -FASTFILL keeps the byte pair
   1.315 +	movb	%al, %ah	// offset == -1: replicate the single byte in %al (the high byte was read past it)
   1.316 +lzsa1fastword:
   1.317 +	shrw	$1, %cx		// %cx = word count, carry = one odd byte left over
   1.318 +	rep stosw		// does not touch the flags
   1.319 +	jnc	lzsa1chunkz	// even length: done
   1.320 +	stosb			// odd length: store the trailing byte
   1.321 +# endif
   1.322 +	jmp	lzsa1chunkz	// rejoin the decoder (lzsa1chunkz is lzsa1chunk in flat builds)
   1.323 +#endif
   1.324 +
   1.325 +lzsa1len:			// get length in %cx; in: %al = token bits, %cl = field mask, %ch = minimum; may read extra bytes at %ds:%si
   1.326 +	andb	%cl, %al	// isolate the length field
   1.327 +	cbw			// clear %ah
   1.328 +	cmpb	%al, %cl
   1.329 +	jne	lzsa1minNumber	// S=0-6, L=0-14		%cx = %ch + %al   if (%al & %cl != %cl)
   1.330 +	addb	%al, %ch	// field equals the mask: %ch = minimum + mask
   1.331 +	lodsb			// fetch the extra length byte
   1.332 +lzsa1minNumber:
   1.333 +	addb	%ch, %al	// add the bias; carry set when the sum reaches 0x100
   1.334 +	jnc	lzsa1gotNumber  // 0-255			%cx = %ch + %cl + byte   if (%al & %cl == %cl && %ch + %cl + byte < 0x100)
   1.335 +	movb	%al, %ah	// S=256-1791, L=256-4607 or S=256-511, L=256-511 (mov leaves the addb flags intact)
   1.336 +	jne	lzsa1midNumber	// low byte of the sum is nonzero: one more byte follows
   1.337 +	lodsw			// 0-65535			%cx = word   if (%al & %cl == %cl && %ch + %cl + byte == 0x100)
   1.338 +	.byte	0x3C		// mask lodsb with cmpb $0xAC, %al
   1.339 +lzsa1midNumber:
   1.340 +	lodsb			//				%cx = ((%ch + %cl + byte) & 0xFF)*256 + byte2   if (%al & %cl == %cl && %ch + %cl + byte > 0x100)
   1.341 +lzsa1gotNumber:
   1.342 +	xchgw	%ax, %cx	// %cx = length; %ax receives the caller's mask/minimum pair
   1.343 +lzsa1quit:			// shared return, also the target of the jcxz end-of-data exits
   1.344 +	ret