#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, May 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+64/128 bytes of fixed
# table]. It has two code paths: vanilla x86 and vanilla MMX. The
# former is executed on 486 and Pentium, the latter on all others.
# Performance results are for the streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 2.95.3(*)	MMX assembler	x86 assembler
#
# Pentium	100/112(**)	-		50
# PIII		63 /77		14.5		24
# P4		96 /122		24.5		84(***)
# Opteron	50 /71		14.5		30
# Core2		54 /68		10.5		18
#
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why the 2.95.3 results were
#	chosen; another reason is the lack of 3.4.x results for older
#	CPUs;
# (**)	the second number is the result for code compiled with the
#	-fPIC flag, which is actually more relevant, because the
#	assembler code is position-independent;
# (***)	see comment in the non-MMX routine for further details;
#
# To summarize, it's >2-4 times faster than gcc-generated code. To
# anchor it to something else, SHA1 assembler processes one byte in
# 11-13 cycles on contemporary x86 cores. As for the choice of MMX in
# particular, see the comment at the end of the file...

# May 2010
#
# Add PCLMULQDQ version performing at 2.13 cycles per processed byte.
# The question is how close is it to the theoretical limit? The
# pclmulqdq instruction latency appears to be 14 cycles and there
# can't be more than 2 of them executing at any given time. This
# means that a single Karatsuba multiplication would take 28 cycles
# *plus* a few cycles for pre- and post-processing. The
# multiplication then has to be followed by modulo-reduction. Given
# that the aggregated reduction method [see "Carry-less
# Multiplication and Its Usage for Computing the GCM Mode" white
# paper by Intel] allows one to perform reduction only once in a
# while, we can assume that asymptotic performance can be estimated
# as (28+Tmod/Naggr)/16, where Tmod is the time to perform reduction
# and Naggr is the aggregation factor.
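#
# As a quick sanity check of that formula [a worked instance, using
# only parameter values quoted elsewhere in these comments]: with
# Intel's Tmod~19 and Naggr=4 it gives (28+19/4)/16 = 32.75/16 ~=
# 2.05, and with this module's Tmod~13 and Naggr=2 it gives
# (28+13/2)/16 = 34.5/16 ~= 2.16 cycles per processed byte.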
#
# Before we proceed to this implementation let's have a closer look
# at the best-performing code suggested by Intel in their white
# paper. By tracing inter-register dependencies Tmod is estimated as
# ~19 cycles and Naggr is 4, resulting in 2.05 cycles per processed
# byte. As implied, this is a quite optimistic estimate, because it
# does not account for Karatsuba pre- and post-processing, which for
# a single multiplication is ~5 cycles. Unfortunately Intel does not
# provide performance data for GHASH alone, only for fused GCM mode.
# But we can estimate it by subtracting the CTR performance result
# provided in the "AES Instruction Set" white paper: 3.54-1.38=2.16
# cycles per processed byte, or 5% off the estimate. It should be
# noted though that 3.54 is the GCM result for 16KB block size, while
# 1.38 is CTR for 1KB block size, meaning that the real number is
# likely to be a bit further from the estimate.
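#
# [The 5% figure is simply (2.16-2.05)/2.05 ~= 0.054, i.e. the
# estimated-versus-derived difference relative to the estimate.]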
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
# 2.16. How is it possible that measured performance is better than
# the optimistic theoretical estimate? There is one thing Intel
# failed to recognize. By fusing GHASH with CTR, the former's
# performance is really limited to the above (Tmul + Tmod/Naggr)
# equation. But if the GHASH procedure is detached, the
# modulo-reduction can be interleaved with Naggr-1 multiplications
# and under ideal conditions even disappear from the equation. So the
# optimistic theoretical estimate for this implementation is ...
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
# at least for such a small Naggr. I'd argue that (28+Tproc/Naggr),
# where Tproc is the time required for Karatsuba pre- and
# post-processing, is a more realistic estimate. In this case it
# gives ... 1.91 cycles per processed byte. Or in other words,
# depending on how well we can interleave reduction and one of the
# two multiplications, the performance should be between 1.91 and
# 2.16. As already mentioned, this implementation processes one byte
# [out of 1KB buffer] in 2.13 cycles, while the x86_64 counterpart
# does it in 2.07. x86_64 performance is better, because the larger
# register bank allows reduction and multiplication to be interleaved
# better.
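#
# [Worked out with the numbers already quoted above: the ideal bound
# is 28/16 = 1.75; with Tproc~5 and Naggr=2 the realistic one is
# (28+5/2)/16 = 30.5/16 ~= 1.91; the measured 2.13 indeed falls
# within the predicted 1.91-2.16 window.]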
#
# Does it make sense to increase Naggr? To start with, it's virtually
# impossible in 32-bit mode, because of the limited register bank
# capacity. Otherwise the improvement has to be weighed against
# slower setup, as well as code size and complexity increase. As even
# the optimistic estimate doesn't promise a 30% performance
# improvement, there are currently no plans to increase Naggr.
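#
# [For scale: even the ideal 28/16 = 1.75 bound is only ~18% below
# the measured 2.13, and by the (28+Tproc/Naggr)/16 model doubling
# Naggr to 4 would move the estimate merely from ~1.91 to
# (28+5/4)/16 ~= 1.83, about a 4% gain.]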
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
$inp  = "edi";
$Htbl = "esi";
$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
		# than unrolled, which has to be weighed against
		# 2.5x x86-specific code size reduction.

sub x86_loop {
    my $off = shift;
    my $rem = "eax";

	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);	# avoid partial register stalls on PIII

	# shrd practically kills P4, 2.5x deterioration, but P4 has
	# MMX code-path to execute. shrd runs a tad faster [than twice
	# the shifts, moves and ors] on pre-MMX Pentium (as well as
	# PIII and Core2), *but* minimizes code size, spares a register
	# and thus allows folding the loop...
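	# Each iteration below effectively computes [a summary of the
	# instruction sequence that follows, not extra semantics]:
	#
	#	rem   = Z.lo&0xf;		# nibble shifted out
	#	Z     = Z>>4;			# 128-bit shift, shrd cascade
	#	Z.hi ^= rem_4bit[rem];		# reduction correction
	#	Z    ^= Htbl[next nibble];	# per-key table lookup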
    if (!$unroll) {
	my $cnt = $inp;
	&mov	($cnt,15);
	&jmp	(&label("x86_loop"));
    &set_label("x86_loop",16);
	for($i=1;$i<=2;$i++) {
	    &mov	(&LB($rem),&LB($Zll));
	    &shrd	($Zll,$Zlh,4);
	    &and	(&LB($rem),0xf);
	    &shrd	($Zlh,$Zhl,4);
	    &shrd	($Zhl,$Zhh,4);
	    &shr	($Zhh,4);
	    &xor	($Zhh,&DWP($off+16,"esp",$rem,4));

	    &mov	(&LB($rem),&BP($off,"esp",$cnt));
	    if ($i&1) {
		&and	(&LB($rem),0xf0);
	    } else {
		&shl	(&LB($rem),4);
	    }

	    &xor	($Zll,&DWP(8,$Htbl,$rem));
	    &xor	($Zlh,&DWP(12,$Htbl,$rem));
	    &xor	($Zhl,&DWP(0,$Htbl,$rem));
	    &xor	($Zhh,&DWP(4,$Htbl,$rem));

	    if ($i&1) {
		&dec	($cnt);
		&js	(&label("x86_break"));
	    } else {
		&jmp	(&label("x86_loop"));
	    }
	}
    &set_label("x86_break",16);
    } else {
	for($i=1;$i<32;$i++) {
	    &comment($i);
	    &mov	(&LB($rem),&LB($Zll));
	    &shrd	($Zll,$Zlh,4);
	    &and	(&LB($rem),0xf);
	    &shrd	($Zlh,$Zhl,4);
	    &shrd	($Zhl,$Zhh,4);
	    &shr	($Zhh,4);
	    &xor	($Zhh,&DWP($off+16,"esp",$rem,4));

	    if ($i&1) {
		&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
		&and	(&LB($rem),0xf0);
	    } else {
		&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
		&shl	(&LB($rem),4);
	    }

	    &xor	($Zll,&DWP(8,$Htbl,$rem));
	    &xor	($Zlh,&DWP(12,$Htbl,$rem));
	    &xor	($Zhl,&DWP(0,$Htbl,$rem));
	    &xor	($Zhh,&DWP(4,$Htbl,$rem));
	}
    }
	&bswap	($Zll);
	&bswap	($Zlh);
	&bswap	($Zhl);
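	# 386-only builds can't rely on the bswap instruction, and the
	# byte-swap emulation in x86asm.pl presumably goes through the
	# byte-addressable registers, which "ebp" is not; hence the
	# detour through "eax" below [an assumption about the perlasm
	# emulation, bswap itself is architectural on 486+].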
	if (!$x86only) {
		&bswap	($Zhh);
	} else {
		&mov	("eax",$Zhh);
		&bswap	("eax");
		&mov	($Zhh,"eax");
	}
}

if ($unroll) {
    &function_begin_B("_x86_gmult_4bit_inner");
	&x86_loop(4);
	&ret	();
    &function_end_B("_x86_gmult_4bit_inner");
}

sub deposit_rem_4bit {
    my $bias = shift;

	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
}
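
# The 16 constants deposited above are the rem_4bit reduction table
# for the 4-bit method: the entry for a single-bit index 1<<k is
# 0x1C20<<k, i.e. the bit-reflected GHASH reduction polynomial [top
# byte 0xE1] pre-positioned for a 4-bit shift, and composite indices
# are XOR combinations of those. A minimal sketch that reproduces the
# table [for illustration only, not used by the module]:
#
#	my @rem_4bit = map { my ($i,$v) = ($_,0);
#		$v ^= 0x1C20<<$_ for grep { $i>>$_ & 1 } 0..3;
#		$v<<16;
#	} 0..15;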
$suffix = $x86only ? "" : "_x86";

&function_begin("gcm_gmult_4bit".$suffix);
	&stack_push(16+4+1);			# +1 for stack alignment
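	# stack layout from here on [a summary inferred from the code
	# below]: 4 dwords at esp+0..15 hold the Xi copy, 16 dwords at
	# esp+16..79 hold rem_4bit, the last dword is alignment slack.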
	&mov	($inp,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable

	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);

	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);
	&shr	($Zll,20);
	&and	($Zll,0xf0);
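	# ($Zll>>20)&0xf0 equals (Xi[15]&0xf)<<4: the low nibble of the
	# last byte of Xi, pre-scaled by 16, the size of one Htable
	# element [a comment added for clarity, the same trick recurs
	# in gcm_ghash_4bit below].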

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(0));
	}

	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_gmult_4bit".$suffix);

&function_begin("gcm_ghash_4bit".$suffix);
	&stack_push(16+4+1);			# +1 for 64-bit alignment
	&mov	($Zll,&wparam(0));		# load Xi
	&mov	($Htbl,&wparam(1));		# load Htable
	&mov	($inp,&wparam(2));		# load in
	&mov	("ecx",&wparam(3));		# load len
	&add	("ecx",$inp);
	&mov	(&wparam(3),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);

    &set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));		# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&shr	($Zll,20);
	&and	($Zll,0xf0);

	if ($unroll) {
		&call	("_x86_gmult_4bit_inner");
	} else {
		&x86_loop(0);
		&mov	($inp,&wparam(2));
	}
	&lea	($inp,&DWP(16,$inp));
	&cmp	($inp,&wparam(3));
	&mov	(&wparam(2),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(0));		# load Xi
	&mov	(&DWP(12,$inp),$Zll);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);
	&stack_pop(16+4+1);
&function_end("gcm_ghash_4bit".$suffix);
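
# For reference, the two routines emitted above are consumed through
# C prototypes along these lines [an assumption based on the wparam
# usage above, with u64/u128 standing for the caller's 64-/128-bit
# types]:
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#			    const u8 *inp, size_t len);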

if (!$x86only) {{{

&static_label("rem_4bit");

&function_begin_B("_mmx_gmult_4bit_inner");
# MMX version performs 3.5 times better on P4 (see comment in non-MMX
# routine for further details), 100% better on Opteron, ~70% better
# on Core2 and PIII... In other words the effort is considered to be
# well spent... Since the initial release the loop was unrolled in
# order to "liberate" the register previously used as loop counter.
# Instead it's used to optimize the critical path in 'Z.hi ^=
# rem_4bit[Z.lo&0xf]'. The path involves a move of Z.lo from MMX to
# an integer register, effective address calculation and finally a
# merge of the value into Z.hi. Reference to rem_4bit is scheduled so
# late that I had to >>4 rem_4bit elements. This resulted in a 20-45%
# improvement