]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/s390x-mont.pl
Add OpenSSL copyright to .pl files
[thirdparty/openssl.git] / crypto / bn / asm / s390x-mont.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
a2a54ffc
AP
9
10# ====================================================================
11# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# April 2007.
18#
19# Performance improvement over vanilla C code varies from 85% to 45%
20# depending on key length and benchmark. Unfortunately, in this context
21# these are not very impressive results [for code that utilizes "wide"
22# 64x64=128-bit multiplication, which is not commonly available to C
23# programmers]; at least a hand-coded bn_asm.c replacement is known to
24# provide 30-40% better results for the longest keys. Well, on second
25# thought it's not very surprising, because z-CPUs are single-issue
26# and _strictly_ in-order, while bn_mul_mont is more or less
27# dependent on the CPU's ability to pipeline instructions and have several
28# of them "in-flight" at the same time. I mean, while other methods,
29# for example Karatsuba, aim to minimize the number of multiplications at
30# the cost of an increase in other operations, bn_mul_mont aims to neatly
31# "overlap" multiplications and the other operations [and on most
32# platforms even to minimize the number of other operations, in
33# particular references to memory]. But it's possible to improve this
34# module's performance by implementing a dedicated squaring code path and
35# possibly by unrolling loops...
36
8626230a
AP
37# January 2009.
38#
39# Reschedule to minimize/avoid Address Generation Interlock hazard,
40# make inner loops counter-based.
41
e822c756
AP
42# November 2010.
43#
44# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
45# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
46# instructions and achieve "64-bit" performance even in a 31-bit legacy
47# application context. The feature is not specific to any particular
48# processor, as long as it's a "z-CPU". The latter implies that the code
49# remains z/Architecture-specific. Compatibility with 32-bit BN_ULONG
50# is achieved by swapping words after 64-bit loads; follow the _dswap-s.
0ab8fd58
AP
51# On z990 it was measured to perform 2.6-2.2 times better than
52# compiler-generated code, less for longer keys...
e822c756
AP
53
# Command line: <flavour> [ignored args ...] [output-file]
#
# The first argument selects the ABI flavour. Anything matching /3[12]/
# selects the -m31 build (4-byte pointers, no "g" suffix on the pointer-sized
# load/store mnemonics); everything else selects the 64-bit build.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Consume arguments until one looks like a file name ("name.ext") or the
# argument list is exhausted, in which case $output ends up false.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was named, and fail loudly if it
# cannot be created: a silently failed open would leave the build with a
# missing or stale assembly file. The three-argument form keeps characters
# in the file name from being interpreted as part of the open mode.
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Fixed part of a stack frame as used by the template below:
# 16 slots of $SIZE_T bytes followed by 4 slots of 8 bytes.
$stdframe = 16*$SIZE_T + 4*8;

# Scratch registers that do not carry an incoming argument.
($mn0, $num) = ("%r0", "%r1");

# int bn_mul_mont(
($rp,			# BN_ULONG *rp,
 $ap,			# const BN_ULONG *ap,
 $bp,			# const BN_ULONG *bp,
 $np,			# const BN_ULONG *np,
 $n0)			# const BN_ULONG *n0,
	= map { "%r$_" } 2..6;
#$num="160(%r15)"	# int num);

$bi = $rp;		# zaps rp
$j  = "%r7";

# Working registers for the multiply/reduce loops, loop counter and
# stack pointer: %r8 .. %r15 in this order.
($ahi, $alo, $nhi, $nlo, $AHI, $NHI, $count, $sp) = map { "%r$_" } 8..15;

# Assembly template for bn_mul_mont. It is accumulated in $code and
# post-processed later: `...` spans are Perl expressions, _dswap is a
# pseudo-op, ${g} selects the pointer-sized mnemonic variant.
#
# Entry: the sixth argument (num) is pulled from the caller's frame and
# scaled to a byte count; $bp is advanced to &bp[num] so that the saved copy
# can serve as the outer-loop end marker; rp is parked in the caller's frame
# because %r2 doubles as $bi and as the return value.
# The routine returns 0 in %r2 ("not processed") when the byte count is
# below 16.
$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
# -m31 flavour only: refuse an odd number of 32-bit words (byte count with
# the "4" bit set), since the loops below consume 64 bits at a time.
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
# 64-bit flavour only: refuse inputs longer than 96 bytes.
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
# Main body.
#  - %r3-%r15 are saved in the caller's frame; the new frame is
#    $stdframe+8+<byte count> below the old stack pointer. tp[] starts at
#    $stdframe(%r15) and is followed by one 8-byte slot for the top
#    overflow bit.
#  - .L1st computes tp = ap[]*bp[0] + np[]*m1; .Louter/.Linner repeat that
#    for every further bp[i], accumulating into tp and shifting it down by
#    one 64-bit limb per pass.
#  - .Lsub stores tp-np into rp and folds the final borrow into $AHI, which
#    ends up 0 (keep rp) or all-ones (keep tp).
#  - .Lcopy selects between tp[j] and rp[j] with masks instead of selecting
#    a source pointer, so both buffers are read on every iteration and the
#    memory access pattern does not depend on the borrow. tp is overwritten
#    ("zapped") on the way.
#  - %r6-%r15 are restored from the caller's frame and 1 is returned.
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry
	lghi	$NHI,-1
	xgr	$NHI,$AHI	# inverse mask: borrow ? 0 : -1

	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$ahi,$stdframe($j,$sp)	# conditional copy: tp[j]
	lg	$alo,0($j,$rp)		# rp[j]
	ngr	$ahi,$AHI
	ngr	$alo,$NHI
	ogr	$alo,$ahi		# borrow ? tp[j] : rp[j]
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated template line by line and print it:
#  - every `expr` span is replaced by the value of the Perl expression;
#  - the _dswap pseudo-op becomes "rllg reg,reg,32" (swap the two 32-bit
#    halves) when $SIZE_T is 4; otherwise the replacement expression yields
#    an empty string and the pseudo-op simply disappears.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
	print $_,"\n";
}
# Buffered write errors (e.g. a full disk) are only reported by close, so
# check it rather than leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";