#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7	0.58		0.92		5.62
# Cortex-A53	0.85		1.01		8.39
# Cortex-A57	0.73		1.17		7.61
# Denver	0.51		0.65		6.02
# Mongoose	0.65		1.10		8.06
# Kryo		0.76		1.16		8.00
# ThunderX2	1.05
#
# (*) presented for reference/comparison purposes;

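# For orientation, below is a minimal scalar model of the GHASH
# multiplication that the routines in this module compute by other means
# (per-bit loop in the style of NIST SP 800-38D, Algorithm 1).  It is an
# assumed, illustrative sketch only: the sub is never called by this
# generator and its name is not part of any interface.
sub _ghash_mul_reference {
    use Math::BigInt;
    my ($X, $H) = @_;	# non-negative Math::BigInt values below 2^128
    my $R = Math::BigInt->new(0xe1)->blsft(120);	# reduction constant
    my $Z = Math::BigInt->new(0);
    my $V = $H->copy();
    for my $i (0 .. 127) {
	# if bit i of X (most significant bit first) is set, accumulate V
	$Z->bxor($V) if $X->copy()->brsft(127 - $i)->band(1)->is_one();
	# multiply V by x: shift right (bit-reflected order) and fold the
	# shifted-out bit back in with the 0xe1... constant
	my $carry = $V->copy()->band(1)->is_one();
	$V->brsft(1);
	$V->bxor($R) if $carry;
    }
    return $Z;
}
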
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
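# (so a typical invocation from the OpenSSL build looks something like
#  "perl ghashv8-armx.pl linux64 ghashv8-armx.S")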

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.fpu	neon
#ifdef __thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,0xef,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,0xf2
#endif

.text
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle the reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		the data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
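# (An interpretive reading of the sequence below: the "twist" in effect
#  multiplies H by x modulo the GHASH polynomial in the bit-reflected
#  representation -- H is shifted left by one bit across all 128 bits and
#  the 0xc2...01 constant is xored in when the shifted-out bit was set.)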
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	veor		$t3,$Yl,$Yh
	veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	veor		$Ym,$Ym,$t3
	vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	veor		$t3,$t3,$Yh
	veor		$H3,$Xl,$t2		@ H^3
	veor		$H4,$Yl,$t3		@ H^4

	vext.8		$t0,$H3,$H3,#8		@ Karatsuba pre-processing
	vext.8		$t1,$H4,$H4,#8
	vext.8		$t2,$H2,$H2,#8
	veor		$t0,$t0,$H3
	veor		$t1,$t1,$H4
	veor		$t2,$t2,$H2
	vext.8		$H34k,$t0,$t1,#8	@ pack Karatsuba pre-processed
	vst1.64		{$H3-$H4},[x0],#48	@ store Htable[3..5]

	@ calculate H^5 and H^6
	vpmull.p64	$Xl,$H2,$H3
	vpmull.p64	$Yl,$H3,$H3
	vpmull2.p64	$Xh,$H2,$H3
	vpmull2.p64	$Yh,$H3,$H3
	vpmull.p64	$Xm,$t0,$t2
	vpmull.p64	$Ym,$t0,$t0

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	veor		$t3,$Yl,$Yh
	veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	veor		$Ym,$Ym,$t3
	vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	veor		$t3,$t3,$Yh
	veor		$H5,$Xl,$t2		@ H^5
	veor		$H6,$Yl,$t3		@ H^6

	vext.8		$t0,$H5,$H5,#8		@ Karatsuba pre-processing
	vext.8		$t1,$H6,$H6,#8
	vext.8		$t2,$H2,$H2,#8
	veor		$t0,$t0,$H5
	veor		$t1,$t1,$H6
	veor		$t2,$t2,$H2
	vext.8		$H56k,$t0,$t1,#8	@ pack Karatsuba pre-processed
	vst1.64		{$H5-$H6},[x0],#48	@ store Htable[6..8]

	@ calculate H^7 and H^8
	vpmull.p64	$Xl,$H2,$H5
	vpmull.p64	$Yl,$H2,$H6
	vpmull2.p64	$Xh,$H2,$H5
	vpmull2.p64	$Yh,$H2,$H6
	vpmull.p64	$Xm,$t0,$t2
	vpmull.p64	$Ym,$t1,$t2

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	veor		$t3,$Yl,$Yh
	veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	veor		$Ym,$Ym,$t3
	vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	veor		$t3,$t3,$Yh
	veor		$H7,$Xl,$t2		@ H^7
	veor		$H8,$Yl,$t3		@ H^8

	vext.8		$t0,$H7,$H7,#8		@ Karatsuba pre-processing
	vext.8		$t1,$H8,$H8,#8
	veor		$t0,$t0,$H7
	veor		$t1,$t1,$H8
	vext.8		$H78k,$t0,$t1,#8	@ pack Karatsuba pre-processed
	vst1.64		{$H7-$H8},[x0]		@ store Htable[9..11]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value;
#
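# (The three pmull's below exploit the Karatsuba identity over GF(2)[x]:
#  with H = H.hi*x^64 + H.lo and Xi = Xi.hi*x^64 + Xi.lo,
#    H*Xi = H.hi*Xi.hi*x^128
#	  + ((H.lo+H.hi)*(Xi.lo+Xi.hi) + H.hi*Xi.hi + H.lo*Xi.lo)*x^64
#	  + H.lo*Xi.lo
#  where + is xor, so the middle term costs one extra multiplication
#  instead of two.)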
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible
#		by the block size;
# output:	next hash value Xi;
#
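# On 64-bit flavours this entry point immediately dispatches to the
# 4x-aggregated subroutine further below whenever there are 64 or more
# bytes to process.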
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp		$len,#64
	b.hs		.Lgcm_ghash_v8_4x
___
$code.=<<___	if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ the loaded value has to
						@ be rotated in order to
						@ make it appear as in
						@ the algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ the last block[s] are
						@ actually loaded twice, but
						@ the last copy is not
						@ processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
#	 [(H*Ii+1) + (H*Xi+1)] mod P =
#	 [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
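# i.e. two blocks per reduction: the running sum is multiplied by H^2
# while the newer block is multiplied by H, and both 256-bit products
# are accumulated before a single reduction.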
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64		{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$In,$t1,$t1,#8
	vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

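# The 4x analogue of the 2x derivation above:
#	Xi+4 = [H^4*(Xi+Ii) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P
# i.e. four input blocks are folded per reduction.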
$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant

	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#128
	b.lo		.Ltail4x

	b		.Loop4x

.align	4
.Loop4x:
	veor		$t0,$I0,$Xl
	vld1.64		{$I0-$j3},[$inp],#64
	vext.8		$IN,$t0,$t0,#8
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vext.8		$I3,$j3,$j3,#8
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	vext.8		$I2,$j2,$j2,#8
	veor		$Xm,$Xm,$Ym
	vext.8		$I1,$j1,$j1,#8

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	veor		$Xm,$Xm,$t1
	vpmull2.p64	$Yh,$H,$I3
	veor		$Xm,$Xm,$t2
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	veor		$Xl,$Xm,$t2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	veor		$t2,$t2,$Xh
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Xl,$Xl,$t2
	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	vext.8		$Xl,$Xl,$Xl,#8
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#64
	b.hs		.Loop4x

.Ltail4x:
	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	adds		$len,$len,#64
	b.eq		.Ldone4x

	cmp		$len,#32
	b.lo		.Lone
	b.eq		.Ltwo
.Lthree:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0-$j2},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
	veor		$j2,$j2,$I2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	vpmull2.p64	$Yh,$H,$I2
	vpmull.p64	$Ym,$Hhl,$j2
	veor		$Xl,$Xl,$t2
	vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
	veor		$j1,$j1,$I1
	vext.8		$Xl,$Xl,$Xl,#8

	vpmull2.p64	$I1,$H2,$I1
	veor		$t0,$I0,$Xl
	vpmull2.p64	$j1,$Hhl,$j1
	vext.8		$IN,$t0,$t0,#8

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H3,$IN
	vpmull.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Ltwo:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0-$j1},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
	veor		$j1,$j1,$I1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull2.p64	$Yh,$H,$I1
	vpmull.p64	$Ym,$Hhl,$j1

	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H2,$IN
	vpmull2.p64	$Xm,$Hhl,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Lone:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H,$IN
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H,$IN
	vpmull.p64	$Xm,$Hhl,$t0

.Ldone4x:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#endif
___

if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf "ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
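    # (e.g. "vmov	q0#lo,q1#hi" comes out as "ins	v0.d[0],v1.d[1]")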
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o		or
	s/vshr\.s/sshr\.s/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

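	# (e.g. "vld1.64	{q0},[x0],#16" from the shared code above
	#  ends up as "ld1	{v0.2d},[x0],#16" after these rewrites)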
	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # ARMv7 instructions are always encoded little-endian, so the
	    # raw bytes are emitted in that order; the correct solution
	    # would be the .inst directive, but older assemblers don't
	    # implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }

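    # (e.g. "vpmull.p64	q0,q1,q2" is encoded above as VMULL.P64 q0,d2,d4;
    #  the pmull2 variant bumps both source register fields to the odd,
    #  i.e. high, d-register halves d3 and d5)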
    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo	or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o		or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush