]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
a2a54ffc AP |
9 | |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # SHA1 block procedure for s390x. | |
18 | ||
19 | # April 2007. | |
20 | # | |
21 | # Performance is >30% better than gcc 3.3 generated code. But the real | |
22 | # twist is that SHA1 hardware support is detected and utilized. In | |
251718e4 | 23 | # which case performance can reach further >4.5x for larger chunks. |
a2a54ffc | 24 | |
8626230a AP |
25 | # January 2009. |
26 | # | |
27 | # Optimize Xupdate for amount of memory references and reschedule | |
28 | # instructions to favour dual-issue z10 pipeline. On z10 hardware is | |
29 | # "only" ~2.3x faster than software. | |
30 | ||
e822c756 AP |
31 | # November 2010. |
32 | # | |
33 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | |
34 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | |
35 | # instructions and achieve "64-bit" performance even in 31-bit legacy | |
36 | # application context. The feature is not specific to any particular | |
37 | # processor, as long as it's "z-CPU". Latter implies that the code | |
d900a015 | 38 | # remains z/Architecture specific. On z990 it was measured to perform |
da3bd277 | 39 | # 23% better than code generated by gcc 4.3. |
e822c756 | 40 | |
a2a54ffc AP |
# Function code loaded into %r0 for the KIMD instruction; a false value
# omits the hardware-assisted path from the generated code altogether.
$kimdfunc=1;	# magic function code for kimd instruction

# The first command-line argument selects the build flavour.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	# Flavour mentions 31 or 32: 4-byte size_t, plain (non-"g")
	# load/store mnemonics for pointer-sized values.
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Consume arguments until one looks like a file name ("name.ext") or the
# argument list is exhausted.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file.  Only attempt this when a name was
# actually found (otherwise keep writing to the inherited STDOUT), and
# fail loudly instead of silently emitting the code to the wrong place.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}
55 | ||
8626230a AP |
# Register allocation for the generated routine.
#
# %r0/%r1 hold the round constants; $K names whichever of the two is
# currently being added by the round bodies.
($K_00_39,$K_40_79)=("%r0","%r1");
$K=$K_00_39;

# The incoming arguments arrive in %r2..%r4.  $prefetch deliberately
# shares %r2 with $ctx: the software path stores $ctx to the stack in its
# prologue and reloads it after the rounds.
$ctx=$prefetch="%r2";
($inp,$len)=("%r3","%r4");

# Working variables; @V is rotated by the round loops so that each round
# body sees them in the right roles.
($A,$B,$C,$D,$E)=("%r5","%r6","%r7","%r8","%r9");
@V=($A,$B,$C,$D,$E);

# Scratch registers, the rotating message-schedule registers and the
# stack pointer.
($t0,$t1)=("%r10","%r11");
@X=("%r12","%r13","%r14");
$sp="%r15";

# Frame layout: the standard frame followed by 16 4-byte slots, which
# Xupdate addresses as $stdframe+4*(i%16).
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;
a2a54ffc | 74 | |
a2a54ffc AP |
# Xupdate($i) - append the message-schedule code for round $i to $code.
#
# The schedule is handled two 32-bit words at a time, so real work is only
# emitted on even rounds; round 15 additionally emits a one-off warm-up
# that primes $prefetch and $X[0] for the first true update.  After an
# even round @X is rotated so the next call sees the registers in their
# new roles.
sub Xupdate {
	my $round=shift;

	if ($round==15) {
		$code.=<<___;
	lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
	lr $X[0],$X[2]
___
	}

	# Xupdate is vectorized and executed every 2nd cycle
	return if ($round&1);

	if ($round<16) {
		# Rounds 0..15: load two input words straight from $inp.
		$code.=<<___;
	lg $X[0],`$round*4`($inp) ### Xload($round)
	rllg $X[1],$X[0],32
___
	} else {
		# Rounds 16..79: combine earlier schedule slots from the
		# stack and rotate each 32-bit half left by one.
		$code.=<<___;
	xgr $X[0],$prefetch ### Xupdate($round)
	lg $prefetch,`$stdframe+4*(($round+2)%16)`($sp)
	xg $X[0],`$stdframe+4*(($round+8)%16)`($sp)
	xgr $X[0],$prefetch
	rll $X[0],$X[0],1
	rllg $X[1],$X[0],32
	rll $X[1],$X[1],1
	rllg $X[0],$X[1],32
	lr $X[2],$X[1] # feedback
___
	}

	if ($round<=70) {
		# Later rounds no longer store the pair back to the stack.
		$code.=<<___;
	stg $X[0],`$stdframe+4*($round%16)`($sp)
___
	}

	unshift(@X,pop(@X));
}
8626230a AP |
103 | |
# BODY_00_19($i,$a,$b,$c,$d,$e) - append the code for one of rounds 0..19.
#
# Emits  e += K + rol(a,5) + X[i] + (((c ^ d) & b) ^ d);  b = rol(b,30)
# using $t0/$t1 as scratch.  The arguments after the round number are
# register names.
sub BODY_00_19 {
	my $round=shift;
	my ($va,$vb,$vc,$vd,$ve)=@_;
	# Pick the schedule register *before* Xupdate rotates @X.
	my $xw=$X[1];

	Xupdate($round);
	$code.=<<___;
	alr $ve,$K ### $round
	rll $t1,$va,5
	lr $t0,$vd
	xr $t0,$vc
	alr $ve,$t1
	nr $t0,$vb
	alr $ve,$xw
	xr $t0,$vd
	rll $vb,$vb,30
	alr $ve,$t0
___
}
122 | ||
a2a54ffc AP |
# BODY_20_39($i,$a,$b,$c,$d,$e) - append the code for one parity round
# (the main loop uses it for rounds 20..39 and again for 60..79).
#
# Emits  e += K + rol(a,5) + X[i] + (b ^ c ^ d);  b = rol(b,30)
# using $t0/$t1 as scratch.  The arguments after the round number are
# register names.
sub BODY_20_39 {
	my $round=shift;
	my ($va,$vb,$vc,$vd,$ve)=@_;
	# Pick the schedule register *before* Xupdate rotates @X.
	my $xw=$X[1];

	Xupdate($round);
	$code.=<<___;
	alr $ve,$K ### $round
	rll $t1,$va,5
	lr $t0,$vb
	alr $ve,$t1
	xr $t0,$vc
	alr $ve,$xw
	xr $t0,$vd
	rll $vb,$vb,30
	alr $ve,$t0
___
}
140 | ||
# BODY_40_59($i,$a,$b,$c,$d,$e) - append the code for one of rounds 40..59.
#
# Emits  e += K + rol(a,5) + X[i] + (((b | c) & d) | (b & c));
#        b = rol(b,30)
# using $t0/$t1 as scratch ($t1 is reused once rol(a,5) has been added).
# The arguments after the round number are register names.
sub BODY_40_59 {
	my $round=shift;
	my ($va,$vb,$vc,$vd,$ve)=@_;
	# Pick the schedule register *before* Xupdate rotates @X.
	my $xw=$X[1];

	Xupdate($round);
	$code.=<<___;
	alr $ve,$K ### $round
	rll $t1,$va,5
	lr $t0,$vb
	alr $ve,$t1
	or $t0,$vc
	lr $t1,$vb
	nr $t0,$vd
	nr $t1,$vc
	alr $ve,$xw
	or $t0,$t1
	rll $vb,$vb,30
	alr $ve,$t0
___
}
161 | ||
# Constant table (the four SHA-1 round constants padded out to 64 bytes)
# followed by the public entry point.
$code.=<<___;
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
___

# Hardware fast path, emitted only when a KIMD function code is configured.
# It tests the capability words at OPENSSL_s390xcap_P, falls through to
# .Lsoftware when a test fails, and otherwise hands the whole input
# ($len shifted left by 6, i.e. 64 bytes per block) to KIMD.
if ($kimdfunc) {
	$code.=<<___;
	larl %r1,OPENSSL_s390xcap_P
	lg %r0,0(%r1)
	tmhl %r0,0x4000 # check for message-security assist
	jz .Lsoftware
	lg %r0,16(%r1) # check kimd capabilities
	tmhh %r0,`0x8000>>$kimdfunc`
	jz .Lsoftware
	lghi %r0,$kimdfunc
	lgr %r1,$ctx
	lgr %r2,$inp
	sllg %r3,$len,6
	.long 0xb93e0002 # kimd %r0,%r2
	brc 1,.-4 # pay attention to "partial completion"
	br %r14
.align 16
.Lsoftware:
___
}
# Software path prologue: save $ctx and the callee-saved registers in the
# caller's frame, allocate our own frame with a back-chain pointer, then
# load the five state words and both 64-bit constant pairs.
$code.=<<___;
	lghi %r1,-$frame
	st${g} $ctx,`2*$SIZE_T`($sp)
	stm${g} %r6,%r15,`6*$SIZE_T`($sp)
	lgr %r0,$sp
	la $sp,0(%r1,$sp)
	st${g} %r0,0($sp)

	larl $t0,Ktable
	llgf $A,0($ctx)
	llgf $B,4($ctx)
	llgf $C,8($ctx)
	llgf $D,12($ctx)
	llgf $E,16($ctx)

	lg $K_00_39,0($t0)
	lg $K_40_79,8($t0)

.Lloop:
___

# The 80 rounds come in four stages of 20.  Each stage starts by rotating
# its constant register by 32 bits so that the 32-bit adds in the round
# bodies see the right constant, and @V is rotated after every round so
# the body always receives the working variables in their current roles.
$i=0;
for my $stage ([\&BODY_00_19,20,$K_00_39],
	       [\&BODY_20_39,40,$K_00_39],
	       [\&BODY_40_59,60,$K_40_79],
	       [\&BODY_20_39,80,$K_40_79]) {
	my ($body,$limit,$kreg)=@$stage;

	$K=$kreg;
	$code.=<<___;
	rllg $kreg,$kreg,32
___
	for (;$i<$limit;$i++) {
		$body->($i,@V);
		unshift(@V,pop(@V));
	}
}

# Per-block epilogue: reload $ctx (its register doubled as $prefetch),
# advance the input by one 64-byte block, fold the working variables back
# into the context and loop while blocks remain; finally restore the
# saved registers and return.
$code.=<<___;

	l${g} $ctx,`$frame+2*$SIZE_T`($sp)
	la $inp,64($inp)
	al $A,0($ctx)
	al $B,4($ctx)
	al $C,8($ctx)
	al $D,12($ctx)
	al $E,16($ctx)
	st $A,0($ctx)
	st $B,4($ctx)
	st $C,8($ctx)
	st $D,12($ctx)
	st $E,16($ctx)
	brct${g} $len,.Lloop

	lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
	br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm OPENSSL_s390xcap_P,80,8
___
247 | ||
# Replace every `expr` span in the accumulated assembly with the value of
# expr; by this point the heredocs have already interpolated all
# variables, so the spans are plain arithmetic such as offsets and masks.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close time; report them so that a
# truncated assembly file is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";