From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- zlib/contrib/masmx86/bld_ml32.bat | 2 + zlib/contrib/masmx86/gvmat32.asm | 972 +++++++++++++++++++++++++++++++++ zlib/contrib/masmx86/gvmat32.obj | Bin 0 -> 10241 bytes zlib/contrib/masmx86/gvmat32c.c | 62 +++ zlib/contrib/masmx86/inffas32.asm | 1083 +++++++++++++++++++++++++++++++++++++ zlib/contrib/masmx86/inffas32.obj | Bin 0 -> 14893 bytes zlib/contrib/masmx86/mkasm.bat | 3 + zlib/contrib/masmx86/readme.txt | 21 + 8 files changed, 2143 insertions(+) create mode 100644 zlib/contrib/masmx86/bld_ml32.bat create mode 100644 zlib/contrib/masmx86/gvmat32.asm create mode 100644 zlib/contrib/masmx86/gvmat32.obj create mode 100644 zlib/contrib/masmx86/gvmat32c.c create mode 100644 zlib/contrib/masmx86/inffas32.asm create mode 100644 zlib/contrib/masmx86/inffas32.obj create mode 100755 zlib/contrib/masmx86/mkasm.bat create mode 100644 zlib/contrib/masmx86/readme.txt (limited to 'zlib/contrib/masmx86') diff --git a/zlib/contrib/masmx86/bld_ml32.bat b/zlib/contrib/masmx86/bld_ml32.bat new file mode 100644 index 000000000..99144d07a --- /dev/null +++ b/zlib/contrib/masmx86/bld_ml32.bat @@ -0,0 +1,2 @@ +ml /coff /Zi /c /Flgvmat32.lst gvmat32.asm +ml /coff /Zi /c /Flinffas32.lst inffas32.asm diff --git a/zlib/contrib/masmx86/gvmat32.asm b/zlib/contrib/masmx86/gvmat32.asm new file mode 100644 index 000000000..874bb2d48 --- /dev/null +++ b/zlib/contrib/masmx86/gvmat32.asm @@ -0,0 +1,972 @@ +; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 +; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. +; File written by Gilles Vollant, by modifiying the longest_match +; from Jean-loup Gailly in deflate.c +; +; http://www.zlib.net +; http://www.winimage.com/zLibDll +; http://www.muppetlabs.com/~breadbox/software/assembly.html +; +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is in directory \MASM611C of Win95 DDK +; ml.exe is also distributed in http://www.masm32.com/masmdl.htm +; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ +; +; this file contain two implementation of longest_match +; +; longest_match_7fff : written 1996 by Gilles Vollant optimized for +; first Pentium. Assume s->w_mask == 0x7fff +; longest_match_686 : written by Brian raiter (1998), optimized for Pentium Pro +; +; for using an seembly version of longest_match, you need define ASMV in project +; There is two way in using gvmat32.asm +; +; A) Suggested method +; if you want include both longest_match_7fff and longest_match_686 +; compile the asm file running +; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm +; and include gvmat32c.c in your project +; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff, +; longest_match_7fff will be used +; if you have a more modern CPU (Pentium Pro, II and higher) +; longest_match_686 will be used +; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used, +; but this is not a sitation you'll find often +; +; B) Alternative +; if you are not interresed in old cpu performance and want the smaller +; binaries possible +; +; compile the asm file running +; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm +; and do not include gvmat32c.c in your project (ou define also +; NOOLDPENTIUMCODE) +; +; note : as I known, longest_match_686 is very faster than longest_match_7fff +; on pentium Pro/II/III, faster (but less) in P4, but it seem +; longest_match_7fff can be faster (very very litte) on AMD Athlon64/K8 +; +; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2 + +;uInt longest_match_7fff(s, cur_match) +; deflate_state *s; +; IPos cur_match; /* current match */ + + NbStack equ 76 + cur_match equ dword ptr[esp+NbStack-0] + str_s equ dword ptr[esp+NbStack-4] +; 5 dword on top (ret,ebp,esi,edi,ebx) + adrret equ dword ptr[esp+NbStack-8] + pushebp equ dword ptr[esp+NbStack-12] + pushedi equ dword ptr[esp+NbStack-16] + pushesi equ dword ptr[esp+NbStack-20] + pushebx equ dword ptr[esp+NbStack-24] + + chain_length equ dword ptr [esp+NbStack-28] + limit equ dword ptr [esp+NbStack-32] + best_len equ dword ptr [esp+NbStack-36] + window equ dword ptr [esp+NbStack-40] + prev equ dword ptr [esp+NbStack-44] + scan_start equ word ptr [esp+NbStack-48] + wmask equ dword ptr [esp+NbStack-52] + match_start_ptr equ dword ptr [esp+NbStack-56] + nice_match equ dword ptr [esp+NbStack-60] + scan equ dword ptr [esp+NbStack-64] + + windowlen equ dword ptr [esp+NbStack-68] + match_start equ dword ptr [esp+NbStack-72] + strend equ dword ptr [esp+NbStack-76] + NbStackAdd equ (NbStack-24) + + .386p + + name gvmatch + .MODEL FLAT + + + +; all the +zlib1222add offsets are due to the addition of fields +; in zlib in the deflate_state structure since the asm code was first written +; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). +; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). +; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). + + zlib1222add equ 8 + +; Note : these value are good with a 8 bytes boundary pack structure + dep_chain_length equ 74h+zlib1222add + dep_window equ 30h+zlib1222add + dep_strstart equ 64h+zlib1222add + dep_prev_length equ 70h+zlib1222add + dep_nice_match equ 88h+zlib1222add + dep_w_size equ 24h+zlib1222add + dep_prev equ 38h+zlib1222add + dep_w_mask equ 2ch+zlib1222add + dep_good_match equ 84h+zlib1222add + dep_match_start equ 68h+zlib1222add + dep_lookahead equ 6ch+zlib1222add + + +_TEXT segment + +IFDEF NOUNDERLINE + IFDEF NOOLDPENTIUMCODE + public longest_match + public match_init + ELSE + public longest_match_7fff + public cpudetect32 + public longest_match_686 + ENDIF +ELSE + IFDEF NOOLDPENTIUMCODE + public _longest_match + public _match_init + ELSE + public _longest_match_7fff + public _cpudetect32 + public _longest_match_686 + ENDIF +ENDIF + + MAX_MATCH equ 258 + MIN_MATCH equ 3 + MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) + + + +IFNDEF NOOLDPENTIUMCODE +IFDEF NOUNDERLINE +longest_match_7fff proc near +ELSE +_longest_match_7fff proc near +ENDIF + + mov edx,[esp+4] + + + + push ebp + push edi + push esi + push ebx + + sub esp,NbStackAdd + +; initialize or check the variables used in match.asm. + mov ebp,edx + +; chain_length = s->max_chain_length +; if (prev_length>=good_match) chain_length >>= 2 + mov edx,[ebp+dep_chain_length] + mov ebx,[ebp+dep_prev_length] + cmp [ebp+dep_good_match],ebx + ja noshr + shr edx,2 +noshr: +; we increment chain_length because in the asm, the --chain_lenght is in the beginning of the loop + inc edx + mov edi,[ebp+dep_nice_match] + mov chain_length,edx + mov eax,[ebp+dep_lookahead] + cmp eax,edi +; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + jae nolookaheadnicematch + mov edi,eax +nolookaheadnicematch: +; best_len = s->prev_length + mov best_len,ebx + +; window = s->window + mov esi,[ebp+dep_window] + mov ecx,[ebp+dep_strstart] + mov window,esi + + mov nice_match,edi +; scan = window + strstart + add esi,ecx + mov scan,esi +; dx = *window + mov dx,word ptr [esi] +; bx = *(window+best_len-1) + mov bx,word ptr [esi+ebx-1] + add esi,MAX_MATCH-1 +; scan_start = *scan + mov scan_start,dx +; strend = scan + MAX_MATCH-1 + mov strend,esi +; bx = scan_end = *(window+best_len-1) + +; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? +; s->strstart - (IPos)MAX_DIST(s) : NIL; + + mov esi,[ebp+dep_w_size] + sub esi,MIN_LOOKAHEAD +; here esi = MAX_DIST(s) + sub ecx,esi + ja nodist + xor ecx,ecx +nodist: + mov limit,ecx + +; prev = s->prev + mov edx,[ebp+dep_prev] + mov prev,edx + +; + mov edx,dword ptr [ebp+dep_match_start] + mov bp,scan_start + mov eax,cur_match + mov match_start,edx + + mov edx,window + mov edi,edx + add edi,best_len + mov esi,prev + dec edi +; windowlen = window + best_len -1 + mov windowlen,edi + + jmp beginloop2 + align 4 + +; here, in the loop +; eax = ax = cur_match +; ecx = limit +; bx = scan_end +; bp = scan_start +; edi = windowlen (window + best_len -1) +; esi = prev + + +;// here; chain_length <=16 +normalbeg0add16: + add chain_length,16 + jz exitloop +normalbeg0: + cmp word ptr[edi+eax],bx + je normalbeg2noroll +rcontlabnoroll: +; cur_match = prev[cur_match & wmask] + and eax,7fffh + mov ax,word ptr[esi+eax*2] +; if cur_match > limit, go to exitloop + cmp ecx,eax + jnb exitloop +; if --chain_length != 0, go to exitloop + dec chain_length + jnz normalbeg0 + jmp exitloop + +normalbeg2noroll: +; if (scan_start==*(cur_match+window)) goto normalbeg2 + cmp bp,word ptr[edx+eax] + jne rcontlabnoroll + jmp normalbeg2 + +contloop3: + mov edi,windowlen + +; cur_match = prev[cur_match & wmask] + and eax,7fffh + mov ax,word ptr[esi+eax*2] +; if cur_match > limit, go to exitloop + cmp ecx,eax +jnbexitloopshort1: + jnb exitloop +; if --chain_length != 0, go to exitloop + + +; begin the main loop +beginloop2: + sub chain_length,16+1 +; if chain_length <=16, don't use the unrolled loop + jna normalbeg0add16 + +do16: + cmp word ptr[edi+eax],bx + je normalbeg2dc0 + +maccn MACRO lab + and eax,7fffh + mov ax,word ptr[esi+eax*2] + cmp ecx,eax + jnb exitloop + cmp word ptr[edi+eax],bx + je lab + ENDM + +rcontloop0: + maccn normalbeg2dc1 + +rcontloop1: + maccn normalbeg2dc2 + +rcontloop2: + maccn normalbeg2dc3 + +rcontloop3: + maccn normalbeg2dc4 + +rcontloop4: + maccn normalbeg2dc5 + +rcontloop5: + maccn normalbeg2dc6 + +rcontloop6: + maccn normalbeg2dc7 + +rcontloop7: + maccn normalbeg2dc8 + +rcontloop8: + maccn normalbeg2dc9 + +rcontloop9: + maccn normalbeg2dc10 + +rcontloop10: + maccn short normalbeg2dc11 + +rcontloop11: + maccn short normalbeg2dc12 + +rcontloop12: + maccn short normalbeg2dc13 + +rcontloop13: + maccn short normalbeg2dc14 + +rcontloop14: + maccn short normalbeg2dc15 + +rcontloop15: + and eax,7fffh + mov ax,word ptr[esi+eax*2] + cmp ecx,eax + jnb exitloop + + sub chain_length,16 + ja do16 + jmp normalbeg0add16 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +normbeg MACRO rcontlab,valsub +; if we are here, we know that *(match+best_len-1) == scan_end + cmp bp,word ptr[edx+eax] +; if (match != scan_start) goto rcontlab + jne rcontlab +; calculate the good chain_length, and we'll compare scan and match string + add chain_length,16-valsub + jmp iseq + ENDM + + +normalbeg2dc11: + normbeg rcontloop11,11 + +normalbeg2dc12: + normbeg short rcontloop12,12 + +normalbeg2dc13: + normbeg short rcontloop13,13 + +normalbeg2dc14: + normbeg short rcontloop14,14 + +normalbeg2dc15: + normbeg short rcontloop15,15 + +normalbeg2dc10: + normbeg rcontloop10,10 + +normalbeg2dc9: + normbeg rcontloop9,9 + +normalbeg2dc8: + normbeg rcontloop8,8 + +normalbeg2dc7: + normbeg rcontloop7,7 + +normalbeg2dc6: + normbeg rcontloop6,6 + +normalbeg2dc5: + normbeg rcontloop5,5 + +normalbeg2dc4: + normbeg rcontloop4,4 + +normalbeg2dc3: + normbeg rcontloop3,3 + +normalbeg2dc2: + normbeg rcontloop2,2 + +normalbeg2dc1: + normbeg rcontloop1,1 + +normalbeg2dc0: + normbeg rcontloop0,0 + + +; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end + +normalbeg2: + mov edi,window + + cmp bp,word ptr[edi+eax] + jne contloop3 ; if *(ushf*)match != scan_start, continue + +iseq: +; if we are here, we know that *(match+best_len-1) == scan_end +; and (match == scan_start) + + mov edi,edx + mov esi,scan ; esi = scan + add edi,eax ; edi = window + cur_match = match + + mov edx,[esi+3] ; compare manually dword at match+3 + xor edx,[edi+3] ; and scan +3 + + jz begincompare ; if equal, go to long compare + +; we will determine the unmatch byte and calculate len (in esi) + or dl,dl + je eq1rr + mov esi,3 + jmp trfinval +eq1rr: + or dx,dx + je eq1 + + mov esi,4 + jmp trfinval +eq1: + and edx,0ffffffh + jz eq11 + mov esi,5 + jmp trfinval +eq11: + mov esi,6 + jmp trfinval + +begincompare: + ; here we now scan and match begin same + add edi,6 + add esi,6 + mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes + repe cmpsd ; loop until mismatch + + je trfin ; go to trfin if not unmatch +; we determine the unmatch byte + sub esi,4 + mov edx,[edi-4] + xor edx,[esi] + + or dl,dl + jnz trfin + inc esi + + or dx,dx + jnz trfin + inc esi + + and edx,0ffffffh + jnz trfin + inc esi + +trfin: + sub esi,scan ; esi = len +trfinval: +; here we have finised compare, and esi contain len of equal string + cmp esi,best_len ; if len > best_len, go newbestlen + ja short newbestlen +; now we restore edx, ecx and esi, for the big loop + mov esi,prev + mov ecx,limit + mov edx,window + jmp contloop3 + +newbestlen: + mov best_len,esi ; len become best_len + + mov match_start,eax ; save new position as match_start + cmp esi,nice_match ; if best_len >= nice_match, exit + jae exitloop + mov ecx,scan + mov edx,window ; restore edx=window + add ecx,esi + add esi,edx + + dec esi + mov windowlen,esi ; windowlen = window + best_len-1 + mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end + +; now we restore ecx and esi, for the big loop : + mov esi,prev + mov ecx,limit + jmp contloop3 + +exitloop: +; exit : s->match_start=match_start + mov ebx,match_start + mov ebp,str_s + mov ecx,best_len + mov dword ptr [ebp+dep_match_start],ebx + mov eax,dword ptr [ebp+dep_lookahead] + cmp ecx,eax + ja minexlo + mov eax,ecx +minexlo: +; return min(best_len,s->lookahead) + +; restore stack and register ebx,esi,edi,ebp + add esp,NbStackAdd + + pop ebx + pop esi + pop edi + pop ebp + ret +InfoAuthor: +; please don't remove this string ! +; Your are free use gvmat32 in any fre or commercial apps if you don't remove the string in the binary! + db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah + + + +IFDEF NOUNDERLINE +longest_match_7fff endp +ELSE +_longest_match_7fff endp +ENDIF + + +IFDEF NOUNDERLINE +cpudetect32 proc near +ELSE +_cpudetect32 proc near +ENDIF + + push ebx + + pushfd ; push original EFLAGS + pop eax ; get original EFLAGS + mov ecx, eax ; save original EFLAGS + xor eax, 40000h ; flip AC bit in EFLAGS + push eax ; save new EFLAGS value on stack + popfd ; replace current EFLAGS value + pushfd ; get new EFLAGS + pop eax ; store new EFLAGS in EAX + xor eax, ecx ; can’t toggle AC bit, processor=80386 + jz end_cpu_is_386 ; jump if 80386 processor + push ecx + popfd ; restore AC bit in EFLAGS first + + pushfd + pushfd + pop ecx + + mov eax, ecx ; get original EFLAGS + xor eax, 200000h ; flip ID bit in EFLAGS + push eax ; save new EFLAGS value on stack + popfd ; replace current EFLAGS value + pushfd ; get new EFLAGS + pop eax ; store new EFLAGS in EAX + popfd ; restore original EFLAGS + xor eax, ecx ; can’t toggle ID bit, + je is_old_486 ; processor=old + + mov eax,1 + db 0fh,0a2h ;CPUID + +exitcpudetect: + pop ebx + ret + +end_cpu_is_386: + mov eax,0300h + jmp exitcpudetect + +is_old_486: + mov eax,0400h + jmp exitcpudetect + +IFDEF NOUNDERLINE +cpudetect32 endp +ELSE +_cpudetect32 endp +ENDIF +ENDIF + +MAX_MATCH equ 258 +MIN_MATCH equ 3 +MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) +MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) + + +;;; stack frame offsets + +chainlenwmask equ esp + 0 ; high word: current chain len + ; low word: s->wmask +window equ esp + 4 ; local copy of s->window +windowbestlen equ esp + 8 ; s->window + bestlen +scanstart equ esp + 16 ; first two bytes of string +scanend equ esp + 12 ; last two bytes of string +scanalign equ esp + 20 ; dword-misalignment of string +nicematch equ esp + 24 ; a good enough match size +bestlen equ esp + 28 ; size of best match so far +scan equ esp + 32 ; ptr to string wanting match + +LocalVarsSize equ 36 +; saved ebx byte esp + 36 +; saved edi byte esp + 40 +; saved esi byte esp + 44 +; saved ebp byte esp + 48 +; return address byte esp + 52 +deflatestate equ esp + 56 ; the function arguments +curmatch equ esp + 60 + +;;; Offsets for fields in the deflate_state structure. These numbers +;;; are calculated from the definition of deflate_state, with the +;;; assumption that the compiler will dword-align the fields. (Thus, +;;; changing the definition of deflate_state could easily cause this +;;; program to crash horribly, without so much as a warning at +;;; compile time. Sigh.) + +dsWSize equ 36+zlib1222add +dsWMask equ 44+zlib1222add +dsWindow equ 48+zlib1222add +dsPrev equ 56+zlib1222add +dsMatchLen equ 88+zlib1222add +dsPrevMatch equ 92+zlib1222add +dsStrStart equ 100+zlib1222add +dsMatchStart equ 104+zlib1222add +dsLookahead equ 108+zlib1222add +dsPrevLen equ 112+zlib1222add +dsMaxChainLen equ 116+zlib1222add +dsGoodMatch equ 132+zlib1222add +dsNiceMatch equ 136+zlib1222add + + +;;; match.asm -- Pentium-Pro-optimized version of longest_match() +;;; Written for zlib 1.1.2 +;;; Copyright (C) 1998 Brian Raiter +;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html +;;; +;;; This is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License. + +;GLOBAL _longest_match, _match_init + + +;SECTION .text + +;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) + +;_longest_match: +IFDEF NOOLDPENTIUMCODE + IFDEF NOUNDERLINE + longest_match proc near + ELSE + _longest_match proc near + ENDIF +ELSE + IFDEF NOUNDERLINE + longest_match_686 proc near + ELSE + _longest_match_686 proc near + ENDIF +ENDIF + +;;; Save registers that the compiler may be using, and adjust esp to +;;; make room for our stack frame. + + push ebp + push edi + push esi + push ebx + sub esp, LocalVarsSize + +;;; Retrieve the function arguments. ecx will hold cur_match +;;; throughout the entire function. edx will hold the pointer to the +;;; deflate_state structure during the function's setup (before +;;; entering the main loop. + + mov edx, [deflatestate] + mov ecx, [curmatch] + +;;; uInt wmask = s->w_mask; +;;; unsigned chain_length = s->max_chain_length; +;;; if (s->prev_length >= s->good_match) { +;;; chain_length >>= 2; +;;; } + + mov eax, [edx + dsPrevLen] + mov ebx, [edx + dsGoodMatch] + cmp eax, ebx + mov eax, [edx + dsWMask] + mov ebx, [edx + dsMaxChainLen] + jl LastMatchGood + shr ebx, 2 +LastMatchGood: + +;;; chainlen is decremented once beforehand so that the function can +;;; use the sign flag instead of the zero flag for the exit test. +;;; It is then shifted into the high word, to make room for the wmask +;;; value, which it will always accompany. + + dec ebx + shl ebx, 16 + or ebx, eax + mov [chainlenwmask], ebx + +;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + mov eax, [edx + dsNiceMatch] + mov ebx, [edx + dsLookahead] + cmp ebx, eax + jl LookaheadLess + mov ebx, eax +LookaheadLess: mov [nicematch], ebx + +;;; register Bytef *scan = s->window + s->strstart; + + mov esi, [edx + dsWindow] + mov [window], esi + mov ebp, [edx + dsStrStart] + lea edi, [esi + ebp] + mov [scan], edi + +;;; Determine how many bytes the scan ptr is off from being +;;; dword-aligned. + + mov eax, edi + neg eax + and eax, 3 + mov [scanalign], eax + +;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? +;;; s->strstart - (IPos)MAX_DIST(s) : NIL; + + mov eax, [edx + dsWSize] + sub eax, MIN_LOOKAHEAD + sub ebp, eax + jg LimitPositive + xor ebp, ebp +LimitPositive: + +;;; int best_len = s->prev_length; + + mov eax, [edx + dsPrevLen] + mov [bestlen], eax + +;;; Store the sum of s->window + best_len in esi locally, and in esi. + + add esi, eax + mov [windowbestlen], esi + +;;; register ush scan_start = *(ushf*)scan; +;;; register ush scan_end = *(ushf*)(scan+best_len-1); +;;; Posf *prev = s->prev; + + movzx ebx, word ptr [edi] + mov [scanstart], ebx + movzx ebx, word ptr [edi + eax - 1] + mov [scanend], ebx + mov edi, [edx + dsPrev] + +;;; Jump into the main loop. + + mov edx, [chainlenwmask] + jmp short LoopEntry + +align 4 + +;;; do { +;;; match = s->window + cur_match; +;;; if (*(ushf*)(match+best_len-1) != scan_end || +;;; *(ushf*)match != scan_start) continue; +;;; [...] +;;; } while ((cur_match = prev[cur_match & wmask]) > limit +;;; && --chain_length != 0); +;;; +;;; Here is the inner loop of the function. The function will spend the +;;; majority of its time in this loop, and majority of that time will +;;; be spent in the first ten instructions. +;;; +;;; Within this loop: +;;; ebx = scanend +;;; ecx = curmatch +;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) +;;; esi = windowbestlen - i.e., (window + bestlen) +;;; edi = prev +;;; ebp = limit + +LookupLoop: + and ecx, edx + movzx ecx, word ptr [edi + ecx*2] + cmp ecx, ebp + jbe LeaveNow + sub edx, 00010000h + js LeaveNow +LoopEntry: movzx eax, word ptr [esi + ecx - 1] + cmp eax, ebx + jnz LookupLoop + mov eax, [window] + movzx eax, word ptr [eax + ecx] + cmp eax, [scanstart] + jnz LookupLoop + +;;; Store the current value of chainlen. + + mov [chainlenwmask], edx + +;;; Point edi to the string under scrutiny, and esi to the string we +;;; are hoping to match it up with. In actuality, esi and edi are +;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is +;;; initialized to -(MAX_MATCH_8 - scanalign). + + mov esi, [window] + mov edi, [scan] + add esi, ecx + mov eax, [scanalign] + mov edx, 0fffffef8h; -(MAX_MATCH_8) + lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] + lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] + +;;; Test the strings for equality, 8 bytes at a time. At the end, +;;; adjust edx so that it is offset to the exact byte that mismatched. +;;; +;;; We already know at this point that the first three bytes of the +;;; strings match each other, and they can be safely passed over before +;;; starting the compare loop. So what this code does is skip over 0-3 +;;; bytes, as much as necessary in order to dword-align the edi +;;; pointer. (esi will still be misaligned three times out of four.) +;;; +;;; It should be confessed that this loop usually does not represent +;;; much of the total running time. Replacing it with a more +;;; straightforward "rep cmpsb" would not drastically degrade +;;; performance. + +LoopCmps: + mov eax, [esi + edx] + xor eax, [edi + edx] + jnz LeaveLoopCmps + mov eax, [esi + edx + 4] + xor eax, [edi + edx + 4] + jnz LeaveLoopCmps4 + add edx, 8 + jnz LoopCmps + jmp short LenMaximum +LeaveLoopCmps4: add edx, 4 +LeaveLoopCmps: test eax, 0000FFFFh + jnz LenLower + add edx, 2 + shr eax, 16 +LenLower: sub al, 1 + adc edx, 0 + +;;; Calculate the length of the match. If it is longer than MAX_MATCH, +;;; then automatically accept it as the best possible match and leave. + + lea eax, [edi + edx] + mov edi, [scan] + sub eax, edi + cmp eax, MAX_MATCH + jge LenMaximum + +;;; If the length of the match is not longer than the best match we +;;; have so far, then forget it and return to the lookup loop. + + mov edx, [deflatestate] + mov ebx, [bestlen] + cmp eax, ebx + jg LongerMatch + mov esi, [windowbestlen] + mov edi, [edx + dsPrev] + mov ebx, [scanend] + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; s->match_start = cur_match; +;;; best_len = len; +;;; if (len >= nice_match) break; +;;; scan_end = *(ushf*)(scan+best_len-1); + +LongerMatch: mov ebx, [nicematch] + mov [bestlen], eax + mov [edx + dsMatchStart], ecx + cmp eax, ebx + jge LeaveNow + mov esi, [window] + add esi, eax + mov [windowbestlen], esi + movzx ebx, word ptr [edi + eax - 1] + mov edi, [edx + dsPrev] + mov [scanend], ebx + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; Accept the current string, with the maximum possible length. + +LenMaximum: mov edx, [deflatestate] + mov dword ptr [bestlen], MAX_MATCH + mov [edx + dsMatchStart], ecx + +;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; +;;; return s->lookahead; + +LeaveNow: + mov edx, [deflatestate] + mov ebx, [bestlen] + mov eax, [edx + dsLookahead] + cmp ebx, eax + jg LookaheadRet + mov eax, ebx +LookaheadRet: + +;;; Restore the stack and return from whence we came. + + add esp, LocalVarsSize + pop ebx + pop esi + pop edi + pop ebp + + ret +; please don't remove this string ! +; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! + db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah + + +IFDEF NOOLDPENTIUMCODE + IFDEF NOUNDERLINE + longest_match endp + ELSE + _longest_match endp + ENDIF + + IFDEF NOUNDERLINE + match_init proc near + ret + match_init endp + ELSE + _match_init proc near + ret + _match_init endp + ENDIF +ELSE + IFDEF NOUNDERLINE + longest_match_686 endp + ELSE + _longest_match_686 endp + ENDIF +ENDIF + +_TEXT ends +end diff --git a/zlib/contrib/masmx86/gvmat32.obj b/zlib/contrib/masmx86/gvmat32.obj new file mode 100644 index 000000000..ebb326238 Binary files /dev/null and b/zlib/contrib/masmx86/gvmat32.obj differ diff --git a/zlib/contrib/masmx86/gvmat32c.c b/zlib/contrib/masmx86/gvmat32c.c new file mode 100644 index 000000000..7ad2b2794 --- /dev/null +++ b/zlib/contrib/masmx86/gvmat32c.c @@ -0,0 +1,62 @@ +/* gvmat32.c -- C portion of the optimized longest_match for 32 bits x86 + * Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. + * File written by Gilles Vollant, by modifiying the longest_match + * from Jean-loup Gailly in deflate.c + * it prepare all parameters and call the assembly longest_match_gvasm + * longest_match execute standard C code is wmask != 0x7fff + * (assembly code is faster with a fixed wmask) + * + * Read comment at beginning of gvmat32.asm for more information + */ + +#if defined(ASMV) && (!defined(NOOLDPENTIUMCODE)) +#include "deflate.h" + +/* if your C compiler don't add underline before function name, + define ADD_UNDERLINE_ASMFUNC */ +#ifdef ADD_UNDERLINE_ASMFUNC +#define longest_match_7fff _longest_match_7fff +#define longest_match_686 _longest_match_686 +#define cpudetect32 _cpudetect32 +#endif + + +unsigned long cpudetect32(); + +uInt longest_match_c( + deflate_state *s, + IPos cur_match); /* current match */ + + +uInt longest_match_7fff( + deflate_state *s, + IPos cur_match); /* current match */ + +uInt longest_match_686( + deflate_state *s, + IPos cur_match); /* current match */ + + +static uInt iIsPPro=2; + +void match_init () +{ + iIsPPro = (((cpudetect32()/0x100)&0xf)>=6) ? 1 : 0; +} + +uInt longest_match( + deflate_state *s, + IPos cur_match) /* current match */ +{ + if (iIsPPro!=0) + return longest_match_686(s,cur_match); + + if (s->w_mask != 0x7fff) + return longest_match_686(s,cur_match); + + /* now ((s->w_mask == 0x7fff) && (iIsPPro==0)) */ + return longest_match_7fff(s,cur_match); +} + + +#endif /* defined(ASMV) && (!defined(NOOLDPENTIUMCODE)) */ diff --git a/zlib/contrib/masmx86/inffas32.asm b/zlib/contrib/masmx86/inffas32.asm new file mode 100644 index 000000000..4a205125e --- /dev/null +++ b/zlib/contrib/masmx86/inffas32.asm @@ -0,0 +1,1083 @@ +;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding +; * +; * inffas32.asm is derivated from inffas86.c, with translation of assembly code +; * +; * Copyright (C) 1995-2003 Mark Adler +; * For conditions of distribution and use, see copyright notice in zlib.h +; * +; * Copyright (C) 2003 Chris Anderson +; * Please use the copyright conditions above. +; * +; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from +; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at +; * the moment. I have successfully compiled and tested this code with gcc2.96, +; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S +; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX +; * enabled. I will attempt to merge the MMX code into this version. Newer +; * versions of this and inffast.S can be found at +; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ +; * +; * 2005 : modification by Gilles Vollant +; */ +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is in directory \MASM611C of Win95 DDK +; ml.exe is also distributed in http://www.masm32.com/masmdl.htm +; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ +; +; +; compile with command line option +; ml /coff /Zi /c /Flinffas32.lst inffas32.asm + +; if you define NO_GZIP (see inflate.h), compile with +; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm + + +; zlib122sup is 0 fort zlib 1.2.2.1 and lower +; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head +; in inflate_state in inflate.h) +zlib1222sup equ 8 + + +IFDEF GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 +ELSE + IFNDEF NO_GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 + ELSE + INFLATE_MODE_TYPE equ 3 + INFLATE_MODE_BAD equ 17 + ENDIF +ENDIF + + +; 75 "inffast.S" +;FILE "inffast.S" + +;;;GLOBAL _inflate_fast + +;;;SECTION .text + + + + .586p + .mmx + + name inflate_fast_x86 + .MODEL FLAT + +_DATA segment +inflate_fast_use_mmx: + dd 1 + + +_TEXT segment +PUBLIC _inflate_fast + +ALIGN 4 +_inflate_fast: + jmp inflate_fast_entry + + + +ALIGN 4 + db 'Fast decoding Code from Chris Anderson' + db 0 + +ALIGN 4 +invalid_literal_length_code_msg: + db 'invalid literal/length code' + db 0 + +ALIGN 4 +invalid_distance_code_msg: + db 'invalid distance code' + db 0 + +ALIGN 4 +invalid_distance_too_far_msg: + db 'invalid distance too far back' + db 0 + + +ALIGN 4 +inflate_fast_mask: +dd 0 +dd 1 +dd 3 +dd 7 +dd 15 +dd 31 +dd 63 +dd 127 +dd 255 +dd 511 +dd 1023 +dd 2047 +dd 4095 +dd 8191 +dd 16383 +dd 32767 +dd 65535 +dd 131071 +dd 262143 +dd 524287 +dd 1048575 +dd 2097151 +dd 4194303 +dd 8388607 +dd 16777215 +dd 33554431 +dd 67108863 +dd 134217727 +dd 268435455 +dd 536870911 +dd 1073741823 +dd 2147483647 +dd 4294967295 + + +mode_state equ 0 ;/* state->mode */ +wsize_state equ (32+zlib1222sup) ;/* state->wsize */ +write_state equ (36+4+zlib1222sup) ;/* state->write */ +window_state equ (40+4+zlib1222sup) ;/* state->window */ +hold_state equ (44+4+zlib1222sup) ;/* state->hold */ +bits_state equ (48+4+zlib1222sup) ;/* state->bits */ +lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ +distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ +lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ +distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ + + +;;SECTION .text +; 205 "inffast.S" +;GLOBAL inflate_fast_use_mmx + +;SECTION .data + + +; GLOBAL inflate_fast_use_mmx:object +;.size inflate_fast_use_mmx, 4 +; 226 "inffast.S" +;SECTION .text + +ALIGN 4 +inflate_fast_entry: + push edi + push esi + push ebp + push ebx + pushfd + sub esp,64 + cld + + + + + mov esi, [esp+88] + mov edi, [esi+28] + + + + + + + + mov edx, [esi+4] + mov eax, [esi+0] + + add edx,eax + sub edx,11 + + mov [esp+44],eax + mov [esp+20],edx + + mov ebp, [esp+92] + mov ecx, [esi+16] + mov ebx, [esi+12] + + sub ebp,ecx + neg ebp + add ebp,ebx + + sub ecx,257 + add ecx,ebx + + mov [esp+60],ebx + mov [esp+40],ebp + mov [esp+16],ecx +; 285 "inffast.S" + mov eax, [edi+lencode_state] + mov ecx, [edi+distcode_state] + + mov [esp+8],eax + mov [esp+12],ecx + + mov eax,1 + mov ecx, [edi+lenbits_state] + shl eax,cl + dec eax + mov [esp+0],eax + + mov eax,1 + mov ecx, [edi+distbits_state] + shl eax,cl + dec eax + mov [esp+4],eax + + mov eax, [edi+wsize_state] + mov ecx, [edi+write_state] + mov edx, [edi+window_state] + + mov [esp+52],eax + mov [esp+48],ecx + mov [esp+56],edx + + mov ebp, [edi+hold_state] + mov ebx, [edi+bits_state] +; 321 "inffast.S" + mov esi, [esp+44] + mov ecx, [esp+20] + cmp ecx,esi + ja L_align_long + + add ecx,11 + sub ecx,esi + mov eax,12 + sub eax,ecx + lea edi, [esp+28] + rep movsb + mov ecx,eax + xor eax,eax + rep stosb + lea esi, [esp+28] + mov [esp+20],esi + jmp L_is_aligned + + +L_align_long: + test esi,3 + jz L_is_aligned + xor eax,eax + mov al, [esi] + inc esi + mov ecx,ebx + add ebx,8 + shl eax,cl + or ebp,eax + jmp L_align_long + +L_is_aligned: + mov edi, [esp+60] +; 366 "inffast.S" +L_check_mmx: + cmp dword ptr [inflate_fast_use_mmx],2 + je L_init_mmx + ja L_do_loop + + push eax + push ebx + push ecx + push edx + pushfd + mov eax, [esp] + xor dword ptr [esp],0200000h + + + + + popfd + pushfd + pop edx + xor edx,eax + jz L_dont_use_mmx + xor eax,eax + cpuid + cmp ebx,0756e6547h + jne L_dont_use_mmx + cmp ecx,06c65746eh + jne L_dont_use_mmx + cmp edx,049656e69h + jne L_dont_use_mmx + mov eax,1 + cpuid + shr eax,8 + and eax,15 + cmp eax,6 + jne L_dont_use_mmx + test edx,0800000h + jnz L_use_mmx + jmp L_dont_use_mmx +L_use_mmx: + mov dword ptr [inflate_fast_use_mmx],2 + jmp L_check_mmx_pop +L_dont_use_mmx: + mov dword ptr [inflate_fast_use_mmx],3 +L_check_mmx_pop: + pop edx + pop ecx + pop ebx + pop eax + jmp L_check_mmx +; 426 "inffast.S" +ALIGN 4 +L_do_loop: +; 437 "inffast.S" + cmp bl,15 + ja L_get_length_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_length_code: + mov edx, [esp+0] + mov ecx, [esp+8] + and edx,ebp + mov eax, [ecx+edx*4] + +L_dolen: + + + + + + + mov cl,ah + sub bl,ah + shr ebp,cl + + + + + + + test al,al + jnz L_test_for_length_base + + shr eax,16 + stosb + +L_while_test: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop + jmp L_break_loop + +L_test_for_length_base: +; 502 "inffast.S" + mov edx,eax + shr edx,16 + mov cl,al + + test al,16 + jz L_test_for_second_level_length + and cl,15 + jz L_save_len + cmp bl,cl + jae L_add_bits_to_len + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_len: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + +L_save_len: + mov [esp+24],edx + + +L_decode_distance: +; 549 "inffast.S" + cmp bl,15 + ja L_get_distance_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_distance_code: + mov edx, [esp+4] + mov ecx, [esp+12] + and edx,ebp + mov eax, [ecx+edx*4] + + +L_dodist: + mov edx,eax + shr edx,16 + mov cl,ah + sub bl,ah + shr ebp,cl +; 584 "inffast.S" + mov cl,al + + test al,16 + jz L_test_for_second_level_dist + and cl,15 + jz L_check_dist_one + cmp bl,cl + jae L_add_bits_to_dist + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_dist: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + jmp L_check_window + +L_check_window: +; 625 "inffast.S" + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,edx + jb L_clip_window + + mov ecx, [esp+24] + mov esi,edi + sub esi,edx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + jmp L_while_test + +ALIGN 4 +L_check_dist_one: + cmp edx,1 + jne L_check_window + cmp [esp+40],edi + je L_check_window + + dec edi + mov ecx, [esp+24] + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + jmp L_while_test + +ALIGN 4 +L_test_for_second_level_length: + + + + + test al,64 + jnz L_test_for_end_of_block + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+8] + mov eax, [edx+eax*4] + jmp L_dolen + +ALIGN 4 +L_test_for_second_level_dist: + + + + + test al,64 + jnz L_invalid_distance_code + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+12] + mov eax, [edx+eax*4] + jmp L_dodist + +ALIGN 4 +L_clip_window: +; 721 "inffast.S" + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,edx + jb L_invalid_distance_too_far + + add ecx,edx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window + + sub eax,ecx + add esi,eax +; 749 "inffast.S" + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_wrap_around_window: +; 793 "inffast.S" + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_contiguous_in_window: +; 836 "inffast.S" + add esi,eax + sub esi,ecx + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + +L_do_copy1: +; 862 "inffast.S" + mov ecx,eax + rep movsb + + mov esi, [esp+44] + jmp L_while_test +; 878 "inffast.S" +ALIGN 4 +L_init_mmx: + emms + + + + + + movd mm0,ebp + mov ebp,ebx +; 896 "inffast.S" + movd mm4,[esp+0] + movq mm3,mm4 + movd mm5,[esp+4] + movq mm2,mm5 + pxor mm1,mm1 + mov ebx, [esp+8] + jmp L_do_loop_mmx + +ALIGN 4 +L_do_loop_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_length_code_mmx + + movd mm6,ebp + movd mm7,[esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_length_code_mmx: + pand mm4,mm0 + movd eax,mm4 + movq mm4,mm3 + mov eax, [ebx+eax*4] + +L_dolen_mmx: + movzx ecx,ah + movd mm1,ecx + sub ebp,ecx + + test al,al + jnz L_test_for_length_base_mmx + + shr eax,16 + stosb + +L_while_test_mmx: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop_mmx + jmp L_break_loop + +L_test_for_length_base_mmx: + + mov edx,eax + shr edx,16 + + test al,16 + jz L_test_for_second_level_length_mmx + and eax,15 + jz L_decode_distance_mmx + + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add edx,ecx + +L_decode_distance_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_dist_code_mmx + + movd mm6,ebp + movd mm7,[esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_dist_code_mmx: + mov ebx, [esp+12] + pand mm5,mm0 + movd eax,mm5 + movq mm5,mm2 + mov eax, [ebx+eax*4] + +L_dodist_mmx: + + movzx ecx,ah + mov ebx,eax + shr ebx,16 + sub ebp,ecx + movd mm1,ecx + + test al,16 + jz L_test_for_second_level_dist_mmx + and eax,15 + jz L_check_dist_one_mmx + +L_add_bits_to_dist_mmx: + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add ebx,ecx + +L_check_window_mmx: + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,ebx + jb L_clip_window_mmx + + mov ecx,edx + mov esi,edi + sub esi,ebx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_check_dist_one_mmx: + cmp ebx,1 + jne L_check_window_mmx + cmp [esp+40],edi + je L_check_window_mmx + + dec edi + mov ecx,edx + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_test_for_second_level_length_mmx: + test al,64 + jnz L_test_for_end_of_block + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + add ecx,edx + mov eax, [ebx+ecx*4] + jmp L_dolen_mmx + +ALIGN 4 +L_test_for_second_level_dist_mmx: + test al,64 + jnz L_invalid_distance_code + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + mov eax, [esp+12] + add ecx,ebx + mov eax, [eax+ecx*4] + jmp L_dodist_mmx + +ALIGN 4 +L_clip_window_mmx: + + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,ebx + jb L_invalid_distance_too_far + + add ecx,ebx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window_mmx + + sub eax,ecx + add esi,eax + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_wrap_around_window_mmx: + + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window_mmx + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_contiguous_in_window_mmx: + + add esi,eax + sub esi,ecx + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + +L_do_copy1_mmx: + + + mov ecx,edx + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx +; 1174 "inffast.S" +L_invalid_distance_code: + + + + + + mov ecx, invalid_distance_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_test_for_end_of_block: + + + + + + test al,32 + jz L_invalid_literal_length_code + + mov ecx,0 + mov edx,INFLATE_MODE_TYPE + jmp L_update_stream_state + +L_invalid_literal_length_code: + + + + + + mov ecx, invalid_literal_length_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_invalid_distance_too_far: + + + + mov esi, [esp+44] + mov ecx, invalid_distance_too_far_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_update_stream_state: + + mov eax, [esp+88] + test ecx,ecx + jz L_skip_msg + mov [eax+24],ecx +L_skip_msg: + mov eax, [eax+28] + mov [eax+mode_state],edx + jmp L_break_loop + +ALIGN 4 +L_break_loop: +; 1243 "inffast.S" + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_next_in + + + + mov ebx,ebp + +L_update_next_in: +; 1266 "inffast.S" + mov eax, [esp+88] + mov ecx,ebx + mov edx, [eax+28] + shr ecx,3 + sub esi,ecx + shl ecx,3 + sub ebx,ecx + mov [eax+12],edi + mov [edx+bits_state],ebx + mov ecx,ebx + + lea ebx, [esp+28] + cmp [esp+20],ebx + jne L_buf_not_used + + sub esi,ebx + mov ebx, [eax+0] + mov [esp+20],ebx + add esi,ebx + mov ebx, [eax+4] + sub ebx,11 + add [esp+20],ebx + +L_buf_not_used: + mov [eax+0],esi + + mov ebx,1 + shl ebx,cl + dec ebx + + + + + + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_hold + + + + psrlq mm0,mm1 + movd ebp,mm0 + + emms + +L_update_hold: + + + + and ebp,ebx + mov [edx+hold_state],ebp + + + + + mov ebx, [esp+20] + cmp ebx,esi + jbe L_last_is_smaller + + sub ebx,esi + add ebx,11 + mov [eax+4],ebx + jmp L_fixup_out +L_last_is_smaller: + sub esi,ebx + neg esi + add esi,11 + mov [eax+4],esi + + + + +L_fixup_out: + + mov ebx, [esp+16] + cmp ebx,edi + jbe L_end_is_smaller + + sub ebx,edi + add ebx,257 + mov [eax+16],ebx + jmp L_done +L_end_is_smaller: + sub edi,ebx + neg edi + add edi,257 + mov [eax+16],edi + + + + + +L_done: + add esp,64 + popfd + pop ebx + pop ebp + pop esi + pop edi + ret + +_TEXT ends +end diff --git a/zlib/contrib/masmx86/inffas32.obj b/zlib/contrib/masmx86/inffas32.obj new file mode 100644 index 000000000..bd6664d11 Binary files /dev/null and b/zlib/contrib/masmx86/inffas32.obj differ diff --git a/zlib/contrib/masmx86/mkasm.bat b/zlib/contrib/masmx86/mkasm.bat new file mode 100755 index 000000000..70a51f837 --- /dev/null +++ b/zlib/contrib/masmx86/mkasm.bat @@ -0,0 +1,3 @@ +cl /DASMV /I..\.. /O2 /c gvmat32c.c +ml /coff /Zi /c /Flgvmat32.lst gvmat32.asm +ml /coff /Zi /c /Flinffas32.lst inffas32.asm diff --git a/zlib/contrib/masmx86/readme.txt b/zlib/contrib/masmx86/readme.txt new file mode 100644 index 000000000..7b57167b7 --- /dev/null +++ b/zlib/contrib/masmx86/readme.txt @@ -0,0 +1,21 @@ + +Summary +------- +This directory contains ASM implementations of the functions +longest_match() and inflate_fast(). + + +Use instructions +---------------- +Copy these files into the zlib source directory, then run the +appropriate makefile, as suggested below. + + +Build instructions +------------------ +* With Microsoft C and MASM: +nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="gvmat32c.obj gvmat32.obj inffas32.obj" + +* With Borland C and TASM: +make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="gvmat32c.obj gvmat32.obj inffas32.obj" OBJPA="+gvmat32c.obj+gvmat32.obj+inffas32.obj" + -- cgit v1.2.3