#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, May, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
# table]. It has two code paths: vanilla x86 and vanilla SSE. The
# former is executed on 486 and Pentium, the latter on all others.
# SSE GHASH features the so-called "528B" variant of the "4-bit"
# method, utilizing an additional 256+16 bytes of per-key storage
# [+512 bytes of shared table]. Performance results are for the
# streamed GHASH subroutine and are expressed in cycles per processed
# byte, less is better:
#
#		gcc 2.95.3(*)	SSE assembler	x86 assembler
#
# Pentium	105/111(**)	-		50
# PIII		68 /75		12.2		24
# P4		125/125		17.8		84(***)
# Opteron	66 /70		10.1		30
# Core2		54 /67		8.4		18
#
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why 2.95.3 results were chosen;
#	another reason is the lack of 3.4.x results for older CPUs.
#	Comparison with SSE results is not completely fair, because C
#	results are for the vanilla "256B" implementation, while
#	assembler results are for "528B";-)
# (**)	second number is for code compiled with the -fPIC flag,
#	which is actually more relevant, because the assembler code is
#	position-independent;
# (***)	see the comment in the non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else, SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for the choice of MMX/SSE
# in particular, see the comment at the end of the file...

# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close this is to the theoretical limit. The
# pclmulqdq instruction latency appears to be 14 cycles and there
# can't be more than 2 of them executing at any given time. This means
# that a single Karatsuba multiplication takes 28 cycles *plus* a few
# cycles for pre- and post-processing. The multiplication then has to
# be followed by modulo-reduction. Given that the aggregated reduction
# method [see "Carry-less Multiplication and Its Usage for Computing
# the GCM Mode" white paper by Intel] allows the reduction to be
# performed only once in a while, the asymptotic performance can be
# estimated as (28+Tmod/Naggr)/16, where Tmod is the time to perform
# the reduction and Naggr is the aggregation factor.
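# (Karatsuba here means computing the 128x128-bit product of A=Ah:Al
# and B=Bh:Bl from three pclmulqdq results, Ah*Bh, Al*Bl and
# (Ah^Al)*(Bh^Bl), instead of four; with at most two multiplications
# in flight, the three of them occupy two full 14-cycle latencies,
# hence the 28 above.)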
#
# Before we proceed to this implementation, let's have a closer look
# at the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies, Tmod is estimated as ~19
# cycles and the Naggr chosen by Intel is 4, resulting in 2.05 cycles
# per processed byte. As implied, this is a quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note
# that this result even accounts for pre-computing the degrees of the
# hash key H, but their portion is negligible at 16KB buffer size.
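# (Those parameters plugged into the formula above indeed give
# (28+19/4)/16 = 32.75/16 ~= 2.05 cycles per processed byte.)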
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving an asymptotic performance of ...
# 2.16. How is it possible that the measured performance is better
# than the optimistic theoretical estimate? There is one thing Intel
# failed to recognize. By serializing GHASH with CTR in the same
# subroutine, the former's performance is really limited by the above
# (Tmul + Tmod/Naggr) equation. But if the GHASH procedure is
# detached, the modulo-reduction can be interleaved with Naggr-1
# multiplications at instruction level and under ideal conditions even
# disappear from the equation. So the optimistic theoretical estimate
# for this implementation is ... 28/16=1.75, and not 2.16. Well, that
# is probably way too optimistic, at least for such a small Naggr. I'd
# argue that (28+Tproc/Naggr)/16, where Tproc is the time required for
# Karatsuba pre- and post-processing, is a more realistic estimate. In
# this case it gives ... 1.91 cycles. In other words, depending on how
# well we can interleave the reduction with one of the two
# multiplications, the performance should be between 1.91 and 2.16. As
# already mentioned, this implementation processes one byte out of an
# 8KB buffer in 2.10 cycles, while the x86_64 counterpart does it in
# 2.02. The x86_64 performance is better because the larger register
# bank allows better interleaving of reduction and multiplication.
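# (Spelled out: 28/16 = 1.75 is the no-reduction lower bound;
# (28+5/2)/16 = 30.5/16 ~= 1.91 with Tproc~=5 and Naggr=2; and
# (28+13/2)/16 = 34.5/16 ~= 2.16 with Tmod~=13.)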
#
# Does it make sense to increase Naggr? To start with, it's virtually
# impossible in 32-bit mode, because of the limited register bank
# capacity. Otherwise the improvement has to be weighed against slower
# setup, as well as the increase in code size and complexity. As even
# the optimistic estimate doesn't promise a 30% performance
# improvement, there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of the Intel
# Open Source Technology Centre.

# April 2011
#
# Tweaked to optimize transitions between integer and FP operations
# on the same XMM register, the PCLMULQDQ subroutine was measured to
# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on
# Westmere. The minor regression on Westmere is outweighed by the ~15%
# improvement on Sandy Bridge. Strangely enough, an attempt to modify
# the 64-bit code in a similar manner resulted in almost 20%
# degradation on Sandy Bridge, where the original 64-bit code
# processes one byte in 1.95 cycles.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

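# Typical invocation (an assumed example; the exact arguments come
# from the build system):
#
#	perl ghash-x86.pl elf -DOPENSSL_IA32_SSE2 > ghash-x86.s
#
# The first argument selects the perlasm output flavour, the remaining
# arguments are scanned for -DOPENSSL_IA32_SSE2, and a trailing "386"
# forces the x86-only code path.
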
($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
$inp  = "edi";
$Htbl = "esi";

$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighed against
		# 2.5x x86-specific code size reduction.

sub x86_loop {
    my $off = shift;
    my $rem = "eax";

	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII

	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# MMX code-path to execute. shrd runs a tad faster [than twice
	# the shifts, moves and ors] on pre-MMX Pentium (as well as
	# PIII and Core2), *but* minimizes code size, spares a register
	# and thus allows folding the loop...
	if (!$unroll) {
	my $cnt = $inp;
	&mov	($cnt,15);
	&jmp	(&label("x86_loop"));
	&set_label("x86_loop",16);
	    for($i=1;$i<=2;$i++) {
		&mov	(&LB($rem),&LB($Zll));
		&shrd	($Zll,$Zlh,4);
		&and	(&LB($rem),0xf);
		&shrd	($Zlh,$Zhl,4);
		&shrd	($Zhl,$Zhh,4);
		&shr	($Zhh,4);
		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));

		&mov	(&LB($rem),&BP($off,"esp",$cnt));
		if ($i&1) {
			&and	(&LB($rem),0xf0);
		} else {
			&shl	(&LB($rem),4);
		}

		&xor	($Zll,&DWP(8,$Htbl,$rem));
		&xor	($Zlh,&DWP(12,$Htbl,$rem));
		&xor	($Zhl,&DWP(0,$Htbl,$rem));
		&xor	($Zhh,&DWP(4,$Htbl,$rem));

		if ($i&1) {
			&dec	($cnt);
			&js	(&label("x86_break"));
		} else {
			&jmp	(&label("x86_loop"));
		}
	    }
	&set_label("x86_break",16);
	} else {
	    for($i=1;$i<32;$i++) {
		&comment($i);
		&mov	(&LB($rem),&LB($Zll));
		&shrd	($Zll,$Zlh,4);
		&and	(&LB($rem),0xf);
		&shrd	($Zlh,$Zhl,4);
		&shrd	($Zhl,$Zhh,4);
		&shr	($Zhh,4);
		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));

		if ($i&1) {
			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
			&and	(&LB($rem),0xf0);
		} else {
			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
			&shl	(&LB($rem),4);
		}

		&xor	($Zll,&DWP(8,$Htbl,$rem));
		&xor	($Zlh,&DWP(12,$Htbl,$rem));
		&xor	($Zhl,&DWP(0,$Htbl,$rem));
		&xor	($Zhh,&DWP(4,$Htbl,$rem));
	    }
	}
	&bswap	($Zll);
	&bswap	($Zlh);
	&bswap	($Zhl);
	if (!$x86only) {
		&bswap	($Zhh);
	} else {
		&mov	("eax",$Zhh);
		&bswap	("eax");
		&mov	($Zhh,"eax");
	}
}

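# For reference, a minimal pure-Perl sketch (hypothetical helper, not
# called by this generator) of what the shrd/shr chain above computes:
# shifting the 128-bit accumulator Z, held in four 32-bit words, right
# by 4 bits, i.e. by one hash-table nibble.
sub _ref_shift_right4 {
	my ($zhh,$zhl,$zlh,$zll) = @_;
	$zll = (($zll>>4) | ($zlh<<28)) & 0xffffffff;	# shrd $Zll,$Zlh,4
	$zlh = (($zlh>>4) | ($zhl<<28)) & 0xffffffff;	# shrd $Zlh,$Zhl,4
	$zhl = (($zhl>>4) | ($zhh<<28)) & 0xffffffff;	# shrd $Zhl,$Zhh,4
	$zhh =   $zhh>>4;				# shr  $Zhh,4
	return ($zhh,$zhl,$zlh,$zll);
}
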
if ($unroll) {
    &function_begin_B("_x86_gmult_4bit_inner");
	&x86_loop(4);
	&ret	();
    &function_end_B("_x86_gmult_4bit_inner");
}

sub deposit_rem_4bit {
    my $bias = shift;

	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
}
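
# The sixteen constants deposited above are the carry-less multiples
# n*0x1C20 for n=0..15, i.e. the reduction contribution of the four
# bits shifted out of Z in each step. A minimal sketch (hypothetical
# helper, not called by this generator) reproducing the same values:
sub _ref_rem_4bit {
    my @tab;
    for my $n (0..15) {
	my $v = 0;
	for my $k (0..3) { $v ^= (0x1C20<<$k) if (($n>>$k)&1); }
	push @tab, $v<<16;	# matches the 0xNNNN<<16 stores above
    }
    return @tab;
}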

$suffix = $x86only ? "" : "_x86";

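# void gcm_gmult_4bit[_x86](u64 Xi[2], const u128 Htable[16]);
# (the C-level signature this routine implements: Xi is the 16-byte
# hash value, Htable the 256-byte per-key table)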
&function_begin("gcm_gmult_4bit".$suffix);
	&stack_push(16+4+1);		# +1 for stack alignment
	&mov	($inp,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable

	&mov	($Zhh,&DWP(0,$inp));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);

	&mov	(&DWP(0,"esp"),$Zhh);	# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);
	&shr	($Zll,20);
	&and	($Zll,0xf0);
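	# the two instructions above extract the low nibble of Xi[15],
	# pre-scaled by 16, i.e. the byte offset of the corresponding
	# 16-byte Htable entry used for the first lookup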

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(0));
	}

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_gmult_4bit".$suffix);

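# void gcm_ghash_4bit[_x86](u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);
# (the C-level signature this routine implements: hashes len bytes of
# input in 16-byte blocks into Xi)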
&function_begin("gcm_ghash_4bit".$suffix);
	&stack_push(16+4+1);		# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable
	&mov	($inp,&wparam(2));	# load in
	&mov	("ecx",&wparam(3));	# load len
	&add	("ecx",$inp);
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

	&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));	# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);	# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&shr	($Zll,20);
	&and	($Zll,0xf0);

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(2));
	}
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));	# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_ghash_4bit".$suffix);

if (!$x86only) {{{

&static_label("rem_4bit");

if (!$sse2) {{	# pure-MMX "May" version...

$S=12;		# shift factor for rem_4bit

&function_begin_B("_mmx_gmult_4bit_inner");
# MMX version performs 3.5 times better on P4 (see comment in non-MMX
# routine for further details), 100% better on Opteron, ~70% better
# on Core2 and PIII... In other words, the effort is considered well
# spent... Since the initial release the loop was unrolled in order
# to "liberate" the register previously used as loop counter. Instead
# it's used to optimize the critical path in 'Z.hi ^=
# rem_4bit[Z.lo&0xf]'. The path involves a move of Z.lo from MMX to
# an integer register, effective address calculation and finally a
# merge of the value into Z.hi. The reference to rem_4bit is scheduled
# so late that I had to >>4 the rem_4bit elements. This resulted in a
# 20-45% improvement