]>
Commit | Line | Data |
---|---|---|
fbac2044 | 1 | \r |
2 | ; match.asm -- Pentium-Pro optimized version of longest_match()\r | |
3 | ;\r | |
4 | ; Updated for zlib 1.1.3 and converted to MASM 6.1x\r | |
5 | ; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>\r | |
6 | ; and Chuck Walbourn <chuckw@kinesoft.com>\r | |
7 | ; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>\r | |
8 | ;\r | |
9 | ; This is free software; you can redistribute it and/or modify it\r | |
10 | ; under the terms of the GNU General Public License.\r | |
11 | \r | |
12 | ; Based on match.S\r | |
13 | ; Written for zlib 1.1.2\r | |
14 | ; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>\r | |
15 | ;\r | |
16 | ; Modified by Gilles Vollant (2005) for add gzhead and gzindex\r | |
17 | \r | |
18 | .686P\r | |
19 | .MODEL FLAT\r | |
20 | \r | |
21 | ;===========================================================================\r | |
22 | ; EQUATES\r | |
23 | ;===========================================================================\r | |
24 | \r | |
25 | MAX_MATCH EQU 258\r | |
26 | MIN_MATCH EQU 3\r | |
27 | MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)\r | |
28 | MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))\r | |
29 | \r | |
30 | ;===========================================================================\r | |
31 | ; STRUCTURES\r | |
32 | ;===========================================================================\r | |
33 | \r | |
34 | ; This STRUCT assumes a 4-byte alignment\r | |
35 | \r | |
36 | DEFLATE_STATE STRUCT\r | |
37 | ds_strm dd ?\r | |
38 | ds_status dd ?\r | |
39 | ds_pending_buf dd ?\r | |
40 | ds_pending_buf_size dd ?\r | |
41 | ds_pending_out dd ?\r | |
42 | ds_pending dd ?\r | |
43 | ds_wrap dd ?\r | |
44 | ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)\r | |
45 | ds_gzhead dd ?\r | |
46 | ds_gzindex dd ?\r | |
47 | ds_data_type db ?\r | |
48 | ds_method db ?\r | |
49 | db ? ; padding\r | |
50 | db ? ; padding\r | |
51 | ds_last_flush dd ?\r | |
52 | ds_w_size dd ? ; used\r | |
53 | ds_w_bits dd ?\r | |
54 | ds_w_mask dd ? ; used\r | |
55 | ds_window dd ? ; used\r | |
56 | ds_window_size dd ?\r | |
57 | ds_prev dd ? ; used\r | |
58 | ds_head dd ?\r | |
59 | ds_ins_h dd ?\r | |
60 | ds_hash_size dd ?\r | |
61 | ds_hash_bits dd ?\r | |
62 | ds_hash_mask dd ?\r | |
63 | ds_hash_shift dd ?\r | |
64 | ds_block_start dd ?\r | |
65 | ds_match_length dd ? ; used\r | |
66 | ds_prev_match dd ? ; used\r | |
67 | ds_match_available dd ?\r | |
68 | ds_strstart dd ? ; used\r | |
69 | ds_match_start dd ? ; used\r | |
70 | ds_lookahead dd ? ; used\r | |
71 | ds_prev_length dd ? ; used\r | |
72 | ds_max_chain_length dd ? ; used\r | |
73 | ds_max_laxy_match dd ?\r | |
74 | ds_level dd ?\r | |
75 | ds_strategy dd ?\r | |
76 | ds_good_match dd ? ; used\r | |
77 | ds_nice_match dd ? ; used\r | |
78 | \r | |
79 | ; Don't need anymore of the struct for match\r | |
80 | DEFLATE_STATE ENDS\r | |
81 | \r | |
82 | ;===========================================================================\r | |
83 | ; CODE\r | |
84 | ;===========================================================================\r | |
85 | _TEXT SEGMENT\r | |
86 | \r | |
87 | ;---------------------------------------------------------------------------\r | |
88 | ; match_init\r | |
89 | ;---------------------------------------------------------------------------\r | |
90 | ALIGN 4\r | |
91 | PUBLIC _match_init\r | |
92 | _match_init PROC\r | |
93 | ; no initialization needed\r | |
94 | ret\r | |
95 | _match_init ENDP\r | |
96 | \r | |
97 | ;---------------------------------------------------------------------------\r | |
98 | ; uInt longest_match(deflate_state *deflatestate, IPos curmatch)\r | |
99 | ;---------------------------------------------------------------------------\r | |
100 | ALIGN 4\r | |
101 | \r | |
102 | PUBLIC _longest_match\r | |
103 | _longest_match PROC\r | |
104 | \r | |
105 | ; Since this code uses EBP for a scratch register, the stack frame must\r | |
106 | ; be manually constructed and referenced relative to the ESP register.\r | |
107 | \r | |
108 | ; Stack image\r | |
109 | ; Variables\r | |
110 | chainlenwmask = 0 ; high word: current chain len\r | |
111 | ; low word: s->wmask\r | |
112 | window = 4 ; local copy of s->window\r | |
113 | windowbestlen = 8 ; s->window + bestlen\r | |
114 | scanend = 12 ; last two bytes of string\r | |
115 | scanstart = 16 ; first two bytes of string\r | |
116 | scanalign = 20 ; dword-misalignment of string\r | |
117 | nicematch = 24 ; a good enough match size\r | |
118 | bestlen = 28 ; size of best match so far\r | |
119 | scan = 32 ; ptr to string wanting match\r | |
120 | varsize = 36 ; number of bytes (also offset to last saved register)\r | |
121 | \r | |
122 | ; Saved Registers (actually pushed into place)\r | |
123 | ebx_save = 36\r | |
124 | edi_save = 40\r | |
125 | esi_save = 44\r | |
126 | ebp_save = 48\r | |
127 | \r | |
128 | ; Parameters\r | |
129 | retaddr = 52\r | |
130 | deflatestate = 56\r | |
131 | curmatch = 60\r | |
132 | \r | |
133 | ; Save registers that the compiler may be using\r | |
134 | push ebp\r | |
135 | push edi\r | |
136 | push esi\r | |
137 | push ebx\r | |
138 | \r | |
139 | ; Allocate local variable space\r | |
140 | sub esp,varsize\r | |
141 | \r | |
142 | ; Retrieve the function arguments. ecx will hold cur_match\r | |
143 | ; throughout the entire function. edx will hold the pointer to the\r | |
144 | ; deflate_state structure during the function's setup (before\r | |
145 | ; entering the main loop).\r | |
146 | \r | |
147 | mov edx, [esp+deflatestate]\r | |
148 | ASSUME edx:PTR DEFLATE_STATE\r | |
149 | \r | |
150 | mov ecx, [esp+curmatch]\r | |
151 | \r | |
152 | ; uInt wmask = s->w_mask;\r | |
153 | ; unsigned chain_length = s->max_chain_length;\r | |
154 | ; if (s->prev_length >= s->good_match) {\r | |
155 | ; chain_length >>= 2;\r | |
156 | ; }\r | |
157 | \r | |
158 | mov eax, [edx].ds_prev_length\r | |
159 | mov ebx, [edx].ds_good_match\r | |
160 | cmp eax, ebx\r | |
161 | mov eax, [edx].ds_w_mask\r | |
162 | mov ebx, [edx].ds_max_chain_length\r | |
163 | jl SHORT LastMatchGood\r | |
164 | shr ebx, 2\r | |
165 | LastMatchGood:\r | |
166 | \r | |
167 | ; chainlen is decremented once beforehand so that the function can\r | |
168 | ; use the sign flag instead of the zero flag for the exit test.\r | |
169 | ; It is then shifted into the high word, to make room for the wmask\r | |
170 | ; value, which it will always accompany.\r | |
171 | \r | |
172 | dec ebx\r | |
173 | shl ebx, 16\r | |
174 | or ebx, eax\r | |
175 | mov [esp+chainlenwmask], ebx\r | |
176 | \r | |
177 | ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;\r | |
178 | \r | |
179 | mov eax, [edx].ds_nice_match\r | |
180 | mov ebx, [edx].ds_lookahead\r | |
181 | cmp ebx, eax\r | |
182 | jl SHORT LookaheadLess\r | |
183 | mov ebx, eax\r | |
184 | LookaheadLess:\r | |
185 | mov [esp+nicematch], ebx\r | |
186 | \r | |
187 | ;/* register Bytef *scan = s->window + s->strstart; */\r | |
188 | \r | |
189 | mov esi, [edx].ds_window\r | |
190 | mov [esp+window], esi\r | |
191 | mov ebp, [edx].ds_strstart\r | |
192 | lea edi, [esi+ebp]\r | |
193 | mov [esp+scan],edi\r | |
194 | \r | |
195 | ;/* Determine how many bytes the scan ptr is off from being */\r | |
196 | ;/* dword-aligned. */\r | |
197 | \r | |
198 | mov eax, edi\r | |
199 | neg eax\r | |
200 | and eax, 3\r | |
201 | mov [esp+scanalign], eax\r | |
202 | \r | |
203 | ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */\r | |
204 | ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */\r | |
205 | \r | |
206 | mov eax, [edx].ds_w_size\r | |
207 | sub eax, MIN_LOOKAHEAD\r | |
208 | sub ebp, eax\r | |
209 | jg SHORT LimitPositive\r | |
210 | xor ebp, ebp\r | |
211 | LimitPositive:\r | |
212 | \r | |
213 | ;/* int best_len = s->prev_length; */\r | |
214 | \r | |
215 | mov eax, [edx].ds_prev_length\r | |
216 | mov [esp+bestlen], eax\r | |
217 | \r | |
218 | ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */\r | |
219 | \r | |
220 | add esi, eax\r | |
221 | mov [esp+windowbestlen], esi\r | |
222 | \r | |
223 | ;/* register ush scan_start = *(ushf*)scan; */\r | |
224 | ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */\r | |
225 | ;/* Posf *prev = s->prev; */\r | |
226 | \r | |
227 | movzx ebx, WORD PTR[edi]\r | |
228 | mov [esp+scanstart], ebx\r | |
229 | movzx ebx, WORD PTR[eax+edi-1]\r | |
230 | mov [esp+scanend], ebx\r | |
231 | mov edi, [edx].ds_prev\r | |
232 | \r | |
233 | ;/* Jump into the main loop. */\r | |
234 | \r | |
235 | mov edx, [esp+chainlenwmask]\r | |
236 | jmp SHORT LoopEntry\r | |
237 | \r | |
238 | ;/* do {\r | |
239 | ; * match = s->window + cur_match;\r | |
240 | ; * if (*(ushf*)(match+best_len-1) != scan_end ||\r | |
241 | ; * *(ushf*)match != scan_start) continue;\r | |
242 | ; * [...]\r | |
243 | ; * } while ((cur_match = prev[cur_match & wmask]) > limit\r | |
244 | ; * && --chain_length != 0);\r | |
245 | ; *\r | |
246 | ; * Here is the inner loop of the function. The function will spend the\r | |
247 | ; * majority of its time in this loop, and majority of that time will\r | |
248 | ; * be spent in the first ten instructions.\r | |
249 | ; *\r | |
250 | ; * Within this loop:\r | |
251 | ; * %ebx = scanend\r | |
252 | ; * %ecx = curmatch\r | |
253 | ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)\r | |
254 | ; * %esi = windowbestlen - i.e., (window + bestlen)\r | |
255 | ; * %edi = prev\r | |
256 | ; * %ebp = limit\r | |
257 | ; */\r | |
258 | \r | |
259 | ALIGN 4\r | |
260 | LookupLoop:\r | |
261 | and ecx, edx\r | |
262 | movzx ecx, WORD PTR[edi+ecx*2]\r | |
263 | cmp ecx, ebp\r | |
264 | jbe LeaveNow\r | |
265 | sub edx, 000010000H\r | |
266 | js LeaveNow\r | |
267 | \r | |
268 | LoopEntry:\r | |
269 | movzx eax, WORD PTR[esi+ecx-1]\r | |
270 | cmp eax, ebx\r | |
271 | jnz SHORT LookupLoop\r | |
272 | \r | |
273 | mov eax, [esp+window]\r | |
274 | movzx eax, WORD PTR[eax+ecx]\r | |
275 | cmp eax, [esp+scanstart]\r | |
276 | jnz SHORT LookupLoop\r | |
277 | \r | |
278 | ;/* Store the current value of chainlen. */\r | |
279 | \r | |
280 | mov [esp+chainlenwmask], edx\r | |
281 | \r | |
282 | ;/* Point %edi to the string under scrutiny, and %esi to the string we */\r | |
283 | ;/* are hoping to match it up with. In actuality, %esi and %edi are */\r | |
284 | ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */\r | |
285 | ;/* initialized to -(MAX_MATCH_8 - scanalign). */\r | |
286 | \r | |
287 | mov esi, [esp+window]\r | |
288 | mov edi, [esp+scan]\r | |
289 | add esi, ecx\r | |
290 | mov eax, [esp+scanalign]\r | |
291 | mov edx, -MAX_MATCH_8\r | |
292 | lea edi, [edi+eax+MAX_MATCH_8]\r | |
293 | lea esi, [esi+eax+MAX_MATCH_8]\r | |
294 | \r | |
295 | ;/* Test the strings for equality, 8 bytes at a time. At the end,\r | |
296 | ; * adjust %edx so that it is offset to the exact byte that mismatched.\r | |
297 | ; *\r | |
298 | ; * We already know at this point that the first three bytes of the\r | |
299 | ; * strings match each other, and they can be safely passed over before\r | |
300 | ; * starting the compare loop. So what this code does is skip over 0-3\r | |
301 | ; * bytes, as much as necessary in order to dword-align the %edi\r | |
302 | ; * pointer. (%esi will still be misaligned three times out of four.)\r | |
303 | ; *\r | |
304 | ; * It should be confessed that this loop usually does not represent\r | |
305 | ; * much of the total running time. Replacing it with a more\r | |
306 | ; * straightforward "rep cmpsb" would not drastically degrade\r | |
307 | ; * performance.\r | |
308 | ; */\r | |
309 | \r | |
310 | LoopCmps:\r | |
311 | mov eax, DWORD PTR[esi+edx]\r | |
312 | xor eax, DWORD PTR[edi+edx]\r | |
313 | jnz SHORT LeaveLoopCmps\r | |
314 | \r | |
315 | mov eax, DWORD PTR[esi+edx+4]\r | |
316 | xor eax, DWORD PTR[edi+edx+4]\r | |
317 | jnz SHORT LeaveLoopCmps4\r | |
318 | \r | |
319 | add edx, 8\r | |
320 | jnz SHORT LoopCmps\r | |
321 | jmp LenMaximum\r | |
322 | ALIGN 4\r | |
323 | \r | |
324 | LeaveLoopCmps4:\r | |
325 | add edx, 4\r | |
326 | \r | |
327 | LeaveLoopCmps:\r | |
328 | test eax, 00000FFFFH\r | |
329 | jnz SHORT LenLower\r | |
330 | \r | |
331 | add edx, 2\r | |
332 | shr eax, 16\r | |
333 | \r | |
334 | LenLower:\r | |
335 | sub al, 1\r | |
336 | adc edx, 0\r | |
337 | \r | |
338 | ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */\r | |
339 | ;/* then automatically accept it as the best possible match and leave. */\r | |
340 | \r | |
341 | lea eax, [edi+edx]\r | |
342 | mov edi, [esp+scan]\r | |
343 | sub eax, edi\r | |
344 | cmp eax, MAX_MATCH\r | |
345 | jge SHORT LenMaximum\r | |
346 | \r | |
347 | ;/* If the length of the match is not longer than the best match we */\r | |
348 | ;/* have so far, then forget it and return to the lookup loop. */\r | |
349 | \r | |
350 | mov edx, [esp+deflatestate]\r | |
351 | mov ebx, [esp+bestlen]\r | |
352 | cmp eax, ebx\r | |
353 | jg SHORT LongerMatch\r | |
354 | mov esi, [esp+windowbestlen]\r | |
355 | mov edi, [edx].ds_prev\r | |
356 | mov ebx, [esp+scanend]\r | |
357 | mov edx, [esp+chainlenwmask]\r | |
358 | jmp LookupLoop\r | |
359 | ALIGN 4\r | |
360 | \r | |
361 | ;/* s->match_start = cur_match; */\r | |
362 | ;/* best_len = len; */\r | |
363 | ;/* if (len >= nice_match) break; */\r | |
364 | ;/* scan_end = *(ushf*)(scan+best_len-1); */\r | |
365 | \r | |
366 | LongerMatch:\r | |
367 | mov ebx, [esp+nicematch]\r | |
368 | mov [esp+bestlen], eax\r | |
369 | mov [edx].ds_match_start, ecx\r | |
370 | cmp eax, ebx\r | |
371 | jge SHORT LeaveNow\r | |
372 | mov esi, [esp+window]\r | |
373 | add esi, eax\r | |
374 | mov [esp+windowbestlen], esi\r | |
375 | movzx ebx, WORD PTR[edi+eax-1]\r | |
376 | mov edi, [edx].ds_prev\r | |
377 | mov [esp+scanend], ebx\r | |
378 | mov edx, [esp+chainlenwmask]\r | |
379 | jmp LookupLoop\r | |
380 | ALIGN 4\r | |
381 | \r | |
382 | ;/* Accept the current string, with the maximum possible length. */\r | |
383 | \r | |
384 | LenMaximum:\r | |
385 | mov edx, [esp+deflatestate]\r | |
386 | mov DWORD PTR[esp+bestlen], MAX_MATCH\r | |
387 | mov [edx].ds_match_start, ecx\r | |
388 | \r | |
389 | ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */\r | |
390 | ;/* return s->lookahead; */\r | |
391 | \r | |
392 | LeaveNow:\r | |
393 | mov edx, [esp+deflatestate]\r | |
394 | mov ebx, [esp+bestlen]\r | |
395 | mov eax, [edx].ds_lookahead\r | |
396 | cmp ebx, eax\r | |
397 | jg SHORT LookaheadRet\r | |
398 | mov eax, ebx\r | |
399 | LookaheadRet:\r | |
400 | \r | |
401 | ; Restore the stack and return from whence we came.\r | |
402 | \r | |
403 | add esp, varsize\r | |
404 | pop ebx\r | |
405 | pop esi\r | |
406 | pop edi\r | |
407 | pop ebp\r | |
408 | ret\r | |
409 | \r | |
410 | _longest_match ENDP\r | |
411 | \r | |
412 | _TEXT ENDS\r | |
413 | END\r |