]> git.ipfire.org Git - thirdparty/gcc.git/blame - zlib/contrib/masm686/match.asm
This commit was generated by cvs2svn to compensate for changes in r104181,
[thirdparty/gcc.git] / zlib / contrib / masm686 / match.asm
CommitLineData
fbac2044 1\r
2; match.asm -- Pentium-Pro optimized version of longest_match()\r
3;\r
4; Updated for zlib 1.1.3 and converted to MASM 6.1x\r
5; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>\r
6; and Chuck Walbourn <chuckw@kinesoft.com>\r
7; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>\r
8;\r
9; This is free software; you can redistribute it and/or modify it\r
10; under the terms of the GNU General Public License.\r
11\r
12; Based on match.S\r
13; Written for zlib 1.1.2\r
14; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>\r
15;\r
16; Modified by Gilles Vollant (2005) for add gzhead and gzindex\r
17\r
18 .686P\r
19 .MODEL FLAT\r
20\r
21;===========================================================================\r
22; EQUATES\r
23;===========================================================================\r
24\r
25MAX_MATCH EQU 258\r
26MIN_MATCH EQU 3\r
27MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)\r
28MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))\r
29\r
30;===========================================================================\r
31; STRUCTURES\r
32;===========================================================================\r
33\r
34; This STRUCT assumes a 4-byte alignment\r
35\r
36DEFLATE_STATE STRUCT\r
37ds_strm dd ?\r
38ds_status dd ?\r
39ds_pending_buf dd ?\r
40ds_pending_buf_size dd ?\r
41ds_pending_out dd ?\r
42ds_pending dd ?\r
43ds_wrap dd ?\r
44; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)\r
45ds_gzhead dd ?\r
46ds_gzindex dd ?\r
47ds_data_type db ?\r
48ds_method db ?\r
49 db ? ; padding\r
50 db ? ; padding\r
51ds_last_flush dd ?\r
52ds_w_size dd ? ; used\r
53ds_w_bits dd ?\r
54ds_w_mask dd ? ; used\r
55ds_window dd ? ; used\r
56ds_window_size dd ?\r
57ds_prev dd ? ; used\r
58ds_head dd ?\r
59ds_ins_h dd ?\r
60ds_hash_size dd ?\r
61ds_hash_bits dd ?\r
62ds_hash_mask dd ?\r
63ds_hash_shift dd ?\r
64ds_block_start dd ?\r
65ds_match_length dd ? ; used\r
66ds_prev_match dd ? ; used\r
67ds_match_available dd ?\r
68ds_strstart dd ? ; used\r
69ds_match_start dd ? ; used\r
70ds_lookahead dd ? ; used\r
71ds_prev_length dd ? ; used\r
72ds_max_chain_length dd ? ; used\r
73ds_max_laxy_match dd ?\r
74ds_level dd ?\r
75ds_strategy dd ?\r
76ds_good_match dd ? ; used\r
77ds_nice_match dd ? ; used\r
78\r
79; Don't need anymore of the struct for match\r
80DEFLATE_STATE ENDS\r
81\r
82;===========================================================================\r
83; CODE\r
84;===========================================================================\r
85_TEXT SEGMENT\r
86\r
87;---------------------------------------------------------------------------\r
88; match_init\r
89;---------------------------------------------------------------------------\r
90 ALIGN 4\r
91PUBLIC _match_init\r
92_match_init PROC\r
93 ; no initialization needed\r
94 ret\r
95_match_init ENDP\r
96\r
97;---------------------------------------------------------------------------\r
98; uInt longest_match(deflate_state *deflatestate, IPos curmatch)\r
99;---------------------------------------------------------------------------\r
100 ALIGN 4\r
101\r
102PUBLIC _longest_match\r
103_longest_match PROC\r
104\r
105; Since this code uses EBP for a scratch register, the stack frame must\r
106; be manually constructed and referenced relative to the ESP register.\r
107\r
108; Stack image\r
109; Variables\r
110chainlenwmask = 0 ; high word: current chain len\r
111 ; low word: s->wmask\r
112window = 4 ; local copy of s->window\r
113windowbestlen = 8 ; s->window + bestlen\r
114scanend = 12 ; last two bytes of string\r
115scanstart = 16 ; first two bytes of string\r
116scanalign = 20 ; dword-misalignment of string\r
117nicematch = 24 ; a good enough match size\r
118bestlen = 28 ; size of best match so far\r
119scan = 32 ; ptr to string wanting match\r
120varsize = 36 ; number of bytes (also offset to last saved register)\r
121\r
122; Saved Registers (actually pushed into place)\r
123ebx_save = 36\r
124edi_save = 40\r
125esi_save = 44\r
126ebp_save = 48\r
127\r
128; Parameters\r
129retaddr = 52\r
130deflatestate = 56\r
131curmatch = 60\r
132\r
133; Save registers that the compiler may be using\r
134 push ebp\r
135 push edi\r
136 push esi\r
137 push ebx\r
138\r
139; Allocate local variable space\r
140 sub esp,varsize\r
141\r
142; Retrieve the function arguments. ecx will hold cur_match\r
143; throughout the entire function. edx will hold the pointer to the\r
144; deflate_state structure during the function's setup (before\r
145; entering the main loop).\r
146\r
147 mov edx, [esp+deflatestate]\r
148ASSUME edx:PTR DEFLATE_STATE\r
149\r
150 mov ecx, [esp+curmatch]\r
151\r
152; uInt wmask = s->w_mask;\r
153; unsigned chain_length = s->max_chain_length;\r
154; if (s->prev_length >= s->good_match) {\r
155; chain_length >>= 2;\r
156; }\r
157\r
158 mov eax, [edx].ds_prev_length\r
159 mov ebx, [edx].ds_good_match\r
160 cmp eax, ebx\r
161 mov eax, [edx].ds_w_mask\r
162 mov ebx, [edx].ds_max_chain_length\r
163 jl SHORT LastMatchGood\r
164 shr ebx, 2\r
165LastMatchGood:\r
166\r
167; chainlen is decremented once beforehand so that the function can\r
168; use the sign flag instead of the zero flag for the exit test.\r
169; It is then shifted into the high word, to make room for the wmask\r
170; value, which it will always accompany.\r
171\r
172 dec ebx\r
173 shl ebx, 16\r
174 or ebx, eax\r
175 mov [esp+chainlenwmask], ebx\r
176\r
177; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;\r
178\r
179 mov eax, [edx].ds_nice_match\r
180 mov ebx, [edx].ds_lookahead\r
181 cmp ebx, eax\r
182 jl SHORT LookaheadLess\r
183 mov ebx, eax\r
184LookaheadLess:\r
185 mov [esp+nicematch], ebx\r
186\r
187;/* register Bytef *scan = s->window + s->strstart; */\r
188\r
189 mov esi, [edx].ds_window\r
190 mov [esp+window], esi\r
191 mov ebp, [edx].ds_strstart\r
192 lea edi, [esi+ebp]\r
193 mov [esp+scan],edi\r
194\r
195;/* Determine how many bytes the scan ptr is off from being */\r
196;/* dword-aligned. */\r
197\r
198 mov eax, edi\r
199 neg eax\r
200 and eax, 3\r
201 mov [esp+scanalign], eax\r
202\r
203;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */\r
204;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */\r
205\r
206 mov eax, [edx].ds_w_size\r
207 sub eax, MIN_LOOKAHEAD\r
208 sub ebp, eax\r
209 jg SHORT LimitPositive\r
210 xor ebp, ebp\r
211LimitPositive:\r
212\r
213;/* int best_len = s->prev_length; */\r
214\r
215 mov eax, [edx].ds_prev_length\r
216 mov [esp+bestlen], eax\r
217\r
218;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */\r
219\r
220 add esi, eax\r
221 mov [esp+windowbestlen], esi\r
222\r
223;/* register ush scan_start = *(ushf*)scan; */\r
224;/* register ush scan_end = *(ushf*)(scan+best_len-1); */\r
225;/* Posf *prev = s->prev; */\r
226\r
227 movzx ebx, WORD PTR[edi]\r
228 mov [esp+scanstart], ebx\r
229 movzx ebx, WORD PTR[eax+edi-1]\r
230 mov [esp+scanend], ebx\r
231 mov edi, [edx].ds_prev\r
232\r
233;/* Jump into the main loop. */\r
234\r
235 mov edx, [esp+chainlenwmask]\r
236 jmp SHORT LoopEntry\r
237\r
238;/* do {\r
239; * match = s->window + cur_match;\r
240; * if (*(ushf*)(match+best_len-1) != scan_end ||\r
241; * *(ushf*)match != scan_start) continue;\r
242; * [...]\r
243; * } while ((cur_match = prev[cur_match & wmask]) > limit\r
244; * && --chain_length != 0);\r
245; *\r
246; * Here is the inner loop of the function. The function will spend the\r
247; * majority of its time in this loop, and majority of that time will\r
248; * be spent in the first ten instructions.\r
249; *\r
250; * Within this loop:\r
251; * %ebx = scanend\r
252; * %ecx = curmatch\r
253; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)\r
254; * %esi = windowbestlen - i.e., (window + bestlen)\r
255; * %edi = prev\r
256; * %ebp = limit\r
257; */\r
258\r
259 ALIGN 4\r
260LookupLoop:\r
261 and ecx, edx\r
262 movzx ecx, WORD PTR[edi+ecx*2]\r
263 cmp ecx, ebp\r
264 jbe LeaveNow\r
265 sub edx, 000010000H\r
266 js LeaveNow\r
267\r
268LoopEntry:\r
269 movzx eax, WORD PTR[esi+ecx-1]\r
270 cmp eax, ebx\r
271 jnz SHORT LookupLoop\r
272\r
273 mov eax, [esp+window]\r
274 movzx eax, WORD PTR[eax+ecx]\r
275 cmp eax, [esp+scanstart]\r
276 jnz SHORT LookupLoop\r
277\r
278;/* Store the current value of chainlen. */\r
279\r
280 mov [esp+chainlenwmask], edx\r
281\r
282;/* Point %edi to the string under scrutiny, and %esi to the string we */\r
283;/* are hoping to match it up with. In actuality, %esi and %edi are */\r
284;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */\r
285;/* initialized to -(MAX_MATCH_8 - scanalign). */\r
286\r
287 mov esi, [esp+window]\r
288 mov edi, [esp+scan]\r
289 add esi, ecx\r
290 mov eax, [esp+scanalign]\r
291 mov edx, -MAX_MATCH_8\r
292 lea edi, [edi+eax+MAX_MATCH_8]\r
293 lea esi, [esi+eax+MAX_MATCH_8]\r
294\r
295;/* Test the strings for equality, 8 bytes at a time. At the end,\r
296; * adjust %edx so that it is offset to the exact byte that mismatched.\r
297; *\r
298; * We already know at this point that the first three bytes of the\r
299; * strings match each other, and they can be safely passed over before\r
300; * starting the compare loop. So what this code does is skip over 0-3\r
301; * bytes, as much as necessary in order to dword-align the %edi\r
302; * pointer. (%esi will still be misaligned three times out of four.)\r
303; *\r
304; * It should be confessed that this loop usually does not represent\r
305; * much of the total running time. Replacing it with a more\r
306; * straightforward "rep cmpsb" would not drastically degrade\r
307; * performance.\r
308; */\r
309\r
310LoopCmps:\r
311 mov eax, DWORD PTR[esi+edx]\r
312 xor eax, DWORD PTR[edi+edx]\r
313 jnz SHORT LeaveLoopCmps\r
314\r
315 mov eax, DWORD PTR[esi+edx+4]\r
316 xor eax, DWORD PTR[edi+edx+4]\r
317 jnz SHORT LeaveLoopCmps4\r
318\r
319 add edx, 8\r
320 jnz SHORT LoopCmps\r
321 jmp LenMaximum\r
322 ALIGN 4\r
323\r
324LeaveLoopCmps4:\r
325 add edx, 4\r
326\r
327LeaveLoopCmps:\r
328 test eax, 00000FFFFH\r
329 jnz SHORT LenLower\r
330\r
331 add edx, 2\r
332 shr eax, 16\r
333\r
334LenLower:\r
335 sub al, 1\r
336 adc edx, 0\r
337\r
338;/* Calculate the length of the match. If it is longer than MAX_MATCH, */\r
339;/* then automatically accept it as the best possible match and leave. */\r
340\r
341 lea eax, [edi+edx]\r
342 mov edi, [esp+scan]\r
343 sub eax, edi\r
344 cmp eax, MAX_MATCH\r
345 jge SHORT LenMaximum\r
346\r
347;/* If the length of the match is not longer than the best match we */\r
348;/* have so far, then forget it and return to the lookup loop. */\r
349\r
350 mov edx, [esp+deflatestate]\r
351 mov ebx, [esp+bestlen]\r
352 cmp eax, ebx\r
353 jg SHORT LongerMatch\r
354 mov esi, [esp+windowbestlen]\r
355 mov edi, [edx].ds_prev\r
356 mov ebx, [esp+scanend]\r
357 mov edx, [esp+chainlenwmask]\r
358 jmp LookupLoop\r
359 ALIGN 4\r
360\r
361;/* s->match_start = cur_match; */\r
362;/* best_len = len; */\r
363;/* if (len >= nice_match) break; */\r
364;/* scan_end = *(ushf*)(scan+best_len-1); */\r
365\r
366LongerMatch:\r
367 mov ebx, [esp+nicematch]\r
368 mov [esp+bestlen], eax\r
369 mov [edx].ds_match_start, ecx\r
370 cmp eax, ebx\r
371 jge SHORT LeaveNow\r
372 mov esi, [esp+window]\r
373 add esi, eax\r
374 mov [esp+windowbestlen], esi\r
375 movzx ebx, WORD PTR[edi+eax-1]\r
376 mov edi, [edx].ds_prev\r
377 mov [esp+scanend], ebx\r
378 mov edx, [esp+chainlenwmask]\r
379 jmp LookupLoop\r
380 ALIGN 4\r
381\r
382;/* Accept the current string, with the maximum possible length. */\r
383\r
384LenMaximum:\r
385 mov edx, [esp+deflatestate]\r
386 mov DWORD PTR[esp+bestlen], MAX_MATCH\r
387 mov [edx].ds_match_start, ecx\r
388\r
389;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */\r
390;/* return s->lookahead; */\r
391\r
392LeaveNow:\r
393 mov edx, [esp+deflatestate]\r
394 mov ebx, [esp+bestlen]\r
395 mov eax, [edx].ds_lookahead\r
396 cmp ebx, eax\r
397 jg SHORT LookaheadRet\r
398 mov eax, ebx\r
399LookaheadRet:\r
400\r
401; Restore the stack and return from whence we came.\r
402\r
403 add esp, varsize\r
404 pop ebx\r
405 pop esi\r
406 pop edi\r
407 pop ebp\r
408 ret\r
409\r
410_longest_match ENDP\r
411\r
412_TEXT ENDS\r
413END\r