1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for SPARCv9.
18#
19# February 2015.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816. In the process of adaptation, the
23# original .c module was made 32-bit savvy in order to make this
24# implementation possible.
25#
26# with/without -DECP_NISTZ256_ASM
27# UltraSPARC III +12-18%
28# SPARC T4 +99-550% (+66-150% on 32-bit Solaris)
29#
30# Ranges denote minimum and maximum improvement coefficients depending
31# on benchmark. Lower coefficients are for ECDSA sign, server-side
32# operation. Keep in mind that +200% means 3x improvement.
33
34$output = pop and open STDOUT,">$output";
35
36$code.=<<___;
37#include "sparc_arch.h"
38
39#define LOCALS (STACK_BIAS+STACK_FRAME)
40#ifdef __arch64__
41.register %g2,#scratch
42.register %g3,#scratch
43# define STACK64_FRAME STACK_FRAME
44# define LOCALS64 LOCALS
45#else
46# define STACK64_FRAME (2047+192)
47# define LOCALS64 STACK64_FRAME
48#endif
49
50.section ".text",#alloc,#execinstr
51___
52########################################################################
53# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
54#
55$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
56open TABLE,"<ecp_nistz256_table.c" or
57open TABLE,"<${dir}../ecp_nistz256_table.c" or
58die "failed to open ecp_nistz256_table.c:",$!;
59
60use integer;
61
62foreach(<TABLE>) {
63 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
64}
65close TABLE;
66
67# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
68# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
69# the number of elements.
70die "insane number of elements" if ($#arr != 64*16*37-1);
71
72$code.=<<___;
73.globl ecp_nistz256_precomputed
74.align 4096
75ecp_nistz256_precomputed:
76___
77########################################################################
78# this conversion smashes P256_POINT_AFFINE into individual bytes placed
79# at 64-byte intervals, similar to
80# 1111222233334444
81# 1234123412341234
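#
# A small worked illustration (not part of the generated table): byte k
# of point j within a block ends up at offset 64*k+j, i.e. every output
# row of 64 bytes holds byte k of all 64 points in the block. This is
# the layout the w7 gather routine expects, so the selected point can be
# rebuilt by reading one byte per 64-byte row.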
82for(1..37) {
83 @tbl = splice(@arr,0,64*16);
84 for($i=0;$i<64;$i++) {
85 undef @line;
86 for($j=0;$j<64;$j++) {
87 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
88 }
89 $code.=".byte\t";
90 $code.=join(',',map { sprintf "0x%02x",$_} @line);
91 $code.="\n";
92 }
93}
94
95{{{
96my ($rp,$ap,$bp)=map("%i$_",(0..2));
97my @acc=map("%l$_",(0..7));
98my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
99my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
100my ($rp_real,$ap_real)=("%g2","%g3");
101
102$code.=<<___;
103.type	ecp_nistz256_precomputed,#object
104.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
105.align 64
106.LRR: ! 2^512 mod P precomputed for NIST P256 polynomial
107.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
108.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
109.Lone:
110.long 1,0,0,0,0,0,0,0
111.asciz "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
112
113! void ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
114.globl ecp_nistz256_to_mont
115.align 64
116ecp_nistz256_to_mont:
117 save %sp,-STACK_FRAME,%sp
118 nop
1191: call .+8
120 add %o7,.LRR-1b,$bp
121 call __ecp_nistz256_mul_mont
122 nop
123 ret
124 restore
125.type	ecp_nistz256_to_mont,#function
126.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
127
128! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
129.globl ecp_nistz256_from_mont
130.align 32
131ecp_nistz256_from_mont:
132 save %sp,-STACK_FRAME,%sp
133 nop
1341: call .+8
135 add %o7,.Lone-1b,$bp
136 call __ecp_nistz256_mul_mont
137 nop
138 ret
139 restore
140.type	ecp_nistz256_from_mont,#function
141.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
142
143! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
144! const BN_ULONG %i2[8]);
145.globl ecp_nistz256_mul_mont
146.align 32
147ecp_nistz256_mul_mont:
148 save %sp,-STACK_FRAME,%sp
149 nop
150 call __ecp_nistz256_mul_mont
151 nop
152 ret
153 restore
154.type	ecp_nistz256_mul_mont,#function
155.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
156
157! void	ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
158.globl ecp_nistz256_sqr_mont
159.align 32
160ecp_nistz256_sqr_mont:
161 save %sp,-STACK_FRAME,%sp
162 mov $ap,$bp
163 call __ecp_nistz256_mul_mont
164 nop
165 ret
166 restore
167.type	ecp_nistz256_sqr_mont,#function
168.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
169___
170
171########################################################################
172# A special thing to keep in mind is that $t0-$t7 hold 64-bit values,
173# while all others are meant to hold 32-bit ones. "Meant to" means that
174# additions to @acc[0-7] do "contaminate" the upper bits, but they are
175# cleared before they can affect the outcome (follow 'and' with $mask).
176# Also keep in mind that addition with carry is addition with a 32-bit
177# carry, even though the CPU is 64-bit. [Addition with 64-bit carry was
178# introduced in T3, see below for VIS3 code paths.]
179
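# A concrete micro-example of that convention (assuming @acc[1] and $t1
# each hold a 32-bit digit): "addcc @acc[1],$t1,@acc[1]" is a full 64-bit
# add, so bit 32 of @acc[1] may become set, but %icc.C is still the carry
# out of bit 31, i.e. exactly the 32-bit carry the chain wants, and the
# stray upper bit is wiped by "and @acc[1],$mask,@acc[1]" before the value
# feeds the next accumulation step.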
180$code.=<<___;
181.align 32
182__ecp_nistz256_mul_mont:
183 ld [$bp+0],$bi ! b[0]
184 mov -1,$mask
185 ld [$ap+0],$a0
186 srl $mask,0,$mask ! 0xffffffff
187 ld [$ap+4],$t1
188 ld [$ap+8],$t2
189 ld [$ap+12],$t3
190 ld [$ap+16],$t4
191 ld [$ap+20],$t5
192 ld [$ap+24],$t6
193 ld [$ap+28],$t7
194 mulx $a0,$bi,$t0 ! a[0-7]*b[0], 64-bit results
195 mulx $t1,$bi,$t1
196 mulx $t2,$bi,$t2
197 mulx $t3,$bi,$t3
198 mulx $t4,$bi,$t4
199 mulx $t5,$bi,$t5
200 mulx $t6,$bi,$t6
201 mulx $t7,$bi,$t7
202 srlx $t0,32,@acc[1] ! extract high parts
203 srlx $t1,32,@acc[2]
204 srlx $t2,32,@acc[3]
205 srlx $t3,32,@acc[4]
206 srlx $t4,32,@acc[5]
207 srlx $t5,32,@acc[6]
208 srlx $t6,32,@acc[7]
209 srlx $t7,32,@acc[0] ! "@acc[8]"
210 mov 0,$carry
211___
212for($i=1;$i<8;$i++) {
213$code.=<<___;
214 addcc @acc[1],$t1,@acc[1] ! accumulate high parts
215 ld [$bp+4*$i],$bi ! b[$i]
216 ld [$ap+4],$t1 ! re-load a[1-7]
217 addccc @acc[2],$t2,@acc[2]
218 addccc @acc[3],$t3,@acc[3]
219 ld [$ap+8],$t2
220 ld [$ap+12],$t3
221 addccc @acc[4],$t4,@acc[4]
222 addccc @acc[5],$t5,@acc[5]
223 ld [$ap+16],$t4
224 ld [$ap+20],$t5
225 addccc @acc[6],$t6,@acc[6]
226 addccc @acc[7],$t7,@acc[7]
227 ld [$ap+24],$t6
228 ld [$ap+28],$t7
229 addccc @acc[0],$carry,@acc[0] ! "@acc[8]"
230 addc %g0,%g0,$carry
231___
232 # Reduction iteration is normally performed by accumulating
233 # result of multiplication of modulus by "magic" digit [and
234 # omitting least significant word, which is guaranteed to
235 # be 0], but thanks to special form of modulus and "magic"
236 # digit being equal to least significant word, it can be
237 # performed with additions and subtractions alone. Indeed:
238 #
239 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
240 # * abcd
241 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
242 #
243 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
244 # rewrite above as:
245 #
246 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
247 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
248 # - abcd.0000.0000.0000.0000.0000.0000.abcd
249 #
250 # or marking redundant operations:
251 #
252 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
253 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
254 # - abcd.----.----.----.----.----.----.----
255
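	# A quick sanity check of that identity (a sketch in the script's
	# own notation): with d denoting the "magic" digit and
	#
	#	p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
	#
	# we have d*p = d*2^256 - d*2^224 + d*2^192 + d*2^96 - d, so adding
	# d*p to the accumulator is exactly r[8]+=d, r[7]-=d, r[6]+=d,
	# r[3]+=d and r[0]-=d, where the last term cancels the digit about
	# to be retired. That is the add/sub pattern emitted below.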
256$code.=<<___;
257 ! multiplication-less reduction
258 addcc @acc[3],$t0,@acc[3] ! r[3]+=r[0]
259 addccc @acc[4],%g0,@acc[4] ! r[4]+=0
260 and @acc[1],$mask,@acc[1]
261 and @acc[2],$mask,@acc[2]
262 addccc @acc[5],%g0,@acc[5] ! r[5]+=0
263 addccc @acc[6],$t0,@acc[6] ! r[6]+=r[0]
264 and @acc[3],$mask,@acc[3]
265 and @acc[4],$mask,@acc[4]
266 addccc @acc[7],%g0,@acc[7] ! r[7]+=0
267 addccc @acc[0],$t0,@acc[0] ! r[8]+=r[0] "@acc[8]"
268 and @acc[5],$mask,@acc[5]
269 and @acc[6],$mask,@acc[6]
270 addc $carry,%g0,$carry ! top-most carry
271 subcc @acc[7],$t0,@acc[7] ! r[7]-=r[0]
272 subccc @acc[0],%g0,@acc[0] ! r[8]-=0 "@acc[8]"
273 subc $carry,%g0,$carry ! top-most carry
274 and @acc[7],$mask,@acc[7]
275 and @acc[0],$mask,@acc[0] ! "@acc[8]"
276___
277 push(@acc,shift(@acc)); # rotate registers to "omit" acc[0]
278$code.=<<___;
279 mulx $a0,$bi,$t0 ! a[0-7]*b[$i], 64-bit results
280 mulx $t1,$bi,$t1
281 mulx $t2,$bi,$t2
282 mulx $t3,$bi,$t3
283 mulx $t4,$bi,$t4
284 mulx $t5,$bi,$t5
285 mulx $t6,$bi,$t6
286 mulx $t7,$bi,$t7
287 add @acc[0],$t0,$t0 ! accumulate low parts, can't overflow
288 add @acc[1],$t1,$t1
289 srlx $t0,32,@acc[1] ! extract high parts
290 add @acc[2],$t2,$t2
291 srlx $t1,32,@acc[2]
292 add @acc[3],$t3,$t3
293 srlx $t2,32,@acc[3]
294 add @acc[4],$t4,$t4
295 srlx $t3,32,@acc[4]
296 add @acc[5],$t5,$t5
297 srlx $t4,32,@acc[5]
298 add @acc[6],$t6,$t6
299 srlx $t5,32,@acc[6]
300 add @acc[7],$t7,$t7
301 srlx $t6,32,@acc[7]
302 srlx $t7,32,@acc[0] ! "@acc[8]"
303___
304}
305$code.=<<___;
306 addcc @acc[1],$t1,@acc[1] ! accumulate high parts
307 addccc @acc[2],$t2,@acc[2]
308 addccc @acc[3],$t3,@acc[3]
309 addccc @acc[4],$t4,@acc[4]
310 addccc @acc[5],$t5,@acc[5]
311 addccc @acc[6],$t6,@acc[6]
312 addccc @acc[7],$t7,@acc[7]
313 addccc @acc[0],$carry,@acc[0] ! "@acc[8]"
314 addc %g0,%g0,$carry
315
316 addcc @acc[3],$t0,@acc[3] ! multiplication-less reduction
317 addccc @acc[4],%g0,@acc[4]
318 addccc @acc[5],%g0,@acc[5]
319 addccc @acc[6],$t0,@acc[6]
320 addccc @acc[7],%g0,@acc[7]
321 addccc @acc[0],$t0,@acc[0] ! "@acc[8]"
322 addc $carry,%g0,$carry
323 subcc @acc[7],$t0,@acc[7]
324 subccc @acc[0],%g0,@acc[0] ! "@acc[8]"
325 subc $carry,%g0,$carry ! top-most carry
326___
327 push(@acc,shift(@acc)); # rotate registers to omit acc[0]
328$code.=<<___;
329 ! Final step is "if result > mod, subtract mod", but we do it
330 ! "other way around", namely subtract modulus from result
331 ! and if it borrowed, add modulus back.
332
333 subcc @acc[0],-1,@acc[0] ! subtract modulus
334 subccc @acc[1],-1,@acc[1]
335 subccc @acc[2],-1,@acc[2]
336 subccc @acc[3],0,@acc[3]
337 subccc @acc[4],0,@acc[4]
338 subccc @acc[5],0,@acc[5]
339 subccc @acc[6],1,@acc[6]
340 subccc @acc[7],-1,@acc[7]
341 subc $carry,0,$carry ! broadcast borrow bit
342
343 ! Note that because mod has special form, i.e. consists of
344 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
345 ! using value of broadcasted borrow and the borrow bit itself.
346 ! To minimize dependency chain we first broadcast and then
347 ! extract the bit by negating (follow $bi).
348
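	! Worked out for illustration: after the subtraction $carry is 0 if
	! nothing was borrowed and all-ones if it was, and $bi = -$carry is
	! then 0 or 1.  Adding ($carry,$carry,$carry,0,0,0,$bi,$carry) to
	! the eight words, least significant first, therefore adds back
	! either zero or exactly the modulus
	! ffffffff.ffffffff.ffffffff.00000000.00000000.00000000.00000001.ffffffff.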
349 addcc @acc[0],$carry,@acc[0] ! add modulus or zero
350 addccc @acc[1],$carry,@acc[1]
351 neg $carry,$bi
352 st @acc[0],[$rp]
353 addccc @acc[2],$carry,@acc[2]
354 st @acc[1],[$rp+4]
355 addccc @acc[3],0,@acc[3]
356 st @acc[2],[$rp+8]
357 addccc @acc[4],0,@acc[4]
358 st @acc[3],[$rp+12]
359 addccc @acc[5],0,@acc[5]
360 st @acc[4],[$rp+16]
361 addccc @acc[6],$bi,@acc[6]
362 st @acc[5],[$rp+20]
363 addc @acc[7],$carry,@acc[7]
364 st @acc[6],[$rp+24]
365 retl
366 st @acc[7],[$rp+28]
367.type	__ecp_nistz256_mul_mont,#function
368.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
369
370! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
371! const BN_ULONG %i2[8]);
372.globl ecp_nistz256_add
373.align 32
374ecp_nistz256_add:
375 save %sp,-STACK_FRAME,%sp
376 ld [$ap],@acc[0]
377 ld [$ap+4],@acc[1]
378 ld [$ap+8],@acc[2]
379 ld [$ap+12],@acc[3]
380 ld [$ap+16],@acc[4]
381 ld [$ap+20],@acc[5]
382 ld [$ap+24],@acc[6]
383 call __ecp_nistz256_add
384 ld [$ap+28],@acc[7]
385 ret
386 restore
387.type	ecp_nistz256_add,#function
388.size ecp_nistz256_add,.-ecp_nistz256_add
389
390.align 32
391__ecp_nistz256_add:
392 ld [$bp+0],$t0 ! b[0]
393 ld [$bp+4],$t1
394 ld [$bp+8],$t2
395 ld [$bp+12],$t3
396 addcc @acc[0],$t0,@acc[0]
397 ld [$bp+16],$t4
398 ld [$bp+20],$t5
399 addccc @acc[1],$t1,@acc[1]
400 ld [$bp+24],$t6
401 ld [$bp+28],$t7
402 addccc @acc[2],$t2,@acc[2]
403 addccc @acc[3],$t3,@acc[3]
404 addccc @acc[4],$t4,@acc[4]
405 addccc @acc[5],$t5,@acc[5]
406 addccc @acc[6],$t6,@acc[6]
407 addccc @acc[7],$t7,@acc[7]
408	addc	%g0,%g0,$carry
409
410.Lreduce_by_sub:
411
412	! if a+b >= modulus, subtract modulus.
413	!
414	! But since comparison implies subtraction, we subtract
415	! modulus and then add it back if subtraction borrowed.
416
417 subcc @acc[0],-1,@acc[0]
418 subccc @acc[1],-1,@acc[1]
419 subccc @acc[2],-1,@acc[2]
420 subccc @acc[3], 0,@acc[3]
421 subccc @acc[4], 0,@acc[4]
422 subccc @acc[5], 0,@acc[5]
423 subccc @acc[6], 1,@acc[6]
424 subccc @acc[7],-1,@acc[7]
425 subc $carry,0,$carry
426
427 ! Note that because mod has special form, i.e. consists of
428 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
429	! using value of borrow and its negative.
430
431 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus
432 addccc @acc[1],$carry,@acc[1]
433 neg $carry,$bi
434 st @acc[0],[$rp]
435	addccc	@acc[2],$carry,@acc[2]
436	st	@acc[1],[$rp+4]
437	addccc	@acc[3],0,@acc[3]
438	st	@acc[2],[$rp+8]
439	addccc	@acc[4],0,@acc[4]
440	st	@acc[3],[$rp+12]
441	addccc	@acc[5],0,@acc[5]
442	st	@acc[4],[$rp+16]
443	addccc	@acc[6],$bi,@acc[6]
444	st	@acc[5],[$rp+20]
445	addc	@acc[7],$carry,@acc[7]
446 st @acc[6],[$rp+24]
447 retl
448 st @acc[7],[$rp+28]
449.type	__ecp_nistz256_add,#function
450.size __ecp_nistz256_add,.-__ecp_nistz256_add
451
452! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
453.globl ecp_nistz256_mul_by_2
454.align 32
455ecp_nistz256_mul_by_2:
456 save %sp,-STACK_FRAME,%sp
457 ld [$ap],@acc[0]
458 ld [$ap+4],@acc[1]
459 ld [$ap+8],@acc[2]
460 ld [$ap+12],@acc[3]
461 ld [$ap+16],@acc[4]
462 ld [$ap+20],@acc[5]
463 ld [$ap+24],@acc[6]
464 call __ecp_nistz256_mul_by_2
465 ld [$ap+28],@acc[7]
466 ret
467 restore
468.type	ecp_nistz256_mul_by_2,#function
469.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
470
471.align 32
472__ecp_nistz256_mul_by_2:
473 addcc @acc[0],@acc[0],@acc[0] ! a+a=2*a
474 addccc @acc[1],@acc[1],@acc[1]
475 addccc @acc[2],@acc[2],@acc[2]
476 addccc @acc[3],@acc[3],@acc[3]
477 addccc @acc[4],@acc[4],@acc[4]
478 addccc @acc[5],@acc[5],@acc[5]
479 addccc @acc[6],@acc[6],@acc[6]
480 addccc @acc[7],@acc[7],@acc[7]
481 b .Lreduce_by_sub
482	addc	%g0,%g0,$carry
483.type	__ecp_nistz256_mul_by_2,#function
484.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
485
486! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
487.globl ecp_nistz256_mul_by_3
488.align 32
489ecp_nistz256_mul_by_3:
490 save %sp,-STACK_FRAME,%sp
491 ld [$ap],@acc[0]
492 ld [$ap+4],@acc[1]
493 ld [$ap+8],@acc[2]
494 ld [$ap+12],@acc[3]
495 ld [$ap+16],@acc[4]
496 ld [$ap+20],@acc[5]
497 ld [$ap+24],@acc[6]
498 call __ecp_nistz256_mul_by_3
499 ld [$ap+28],@acc[7]
500 ret
501 restore
502.type	ecp_nistz256_mul_by_3,#function
503.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
504
505.align 32
506__ecp_nistz256_mul_by_3:
507 addcc @acc[0],@acc[0],$t0 ! a+a=2*a
508 addccc @acc[1],@acc[1],$t1
509 addccc @acc[2],@acc[2],$t2
510 addccc @acc[3],@acc[3],$t3
511 addccc @acc[4],@acc[4],$t4
512 addccc @acc[5],@acc[5],$t5
513 addccc @acc[6],@acc[6],$t6
514 addccc @acc[7],@acc[7],$t7
515	addc	%g0,%g0,$carry
516
517 subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores
518 subccc $t1,-1,$t1
519 subccc $t2,-1,$t2
520 subccc $t3, 0,$t3
521 subccc $t4, 0,$t4
522 subccc $t5, 0,$t5
523 subccc $t6, 1,$t6
524 subccc $t7,-1,$t7
525 subc $carry,0,$carry
526
527 addcc $t0,$carry,$t0 ! add synthesized modulus
528 addccc $t1,$carry,$t1
529	neg	$carry,$bi
530 addccc $t2,$carry,$t2
531 addccc $t3,0,$t3
532 addccc $t4,0,$t4
533 addccc $t5,0,$t5
534 addccc $t6,$bi,$t6
535 addc $t7,$carry,$t7
536
537 addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a
538 addccc $t1,@acc[1],@acc[1]
539 addccc $t2,@acc[2],@acc[2]
540 addccc $t3,@acc[3],@acc[3]
541 addccc $t4,@acc[4],@acc[4]
542 addccc $t5,@acc[5],@acc[5]
543 addccc $t6,@acc[6],@acc[6]
544 addccc $t7,@acc[7],@acc[7]
545 b .Lreduce_by_sub
546	addc	%g0,%g0,$carry
547.type	__ecp_nistz256_mul_by_3,#function
548.size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
549
550! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
551! const BN_ULONG %i2[8]);
552.globl ecp_nistz256_sub
553.align 32
554ecp_nistz256_sub:
555 save %sp,-STACK_FRAME,%sp
556 ld [$ap],@acc[0]
557 ld [$ap+4],@acc[1]
558 ld [$ap+8],@acc[2]
559 ld [$ap+12],@acc[3]
560 ld [$ap+16],@acc[4]
561 ld [$ap+20],@acc[5]
562 ld [$ap+24],@acc[6]
563 call __ecp_nistz256_sub_from
564 ld [$ap+28],@acc[7]
565 ret
566 restore
567.type	ecp_nistz256_sub,#function
568.size ecp_nistz256_sub,.-ecp_nistz256_sub
569
570! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
571.globl ecp_nistz256_neg
572.align 32
573ecp_nistz256_neg:
574 save %sp,-STACK_FRAME,%sp
575 mov $ap,$bp
576 mov 0,@acc[0]
577 mov 0,@acc[1]
578 mov 0,@acc[2]
579 mov 0,@acc[3]
580 mov 0,@acc[4]
581 mov 0,@acc[5]
582 mov 0,@acc[6]
583 call __ecp_nistz256_sub_from
584 mov 0,@acc[7]
585 ret
586 restore
587.type	ecp_nistz256_neg,#function
588.size ecp_nistz256_neg,.-ecp_nistz256_neg
589
590.align 32
591__ecp_nistz256_sub_from:
592 ld [$bp+0],$t0 ! b[0]
593 ld [$bp+4],$t1
594 ld [$bp+8],$t2
595 ld [$bp+12],$t3
596 subcc @acc[0],$t0,@acc[0]
597 ld [$bp+16],$t4
598 ld [$bp+20],$t5
599 subccc @acc[1],$t1,@acc[1]
600 subccc @acc[2],$t2,@acc[2]
601 ld [$bp+24],$t6
602 ld [$bp+28],$t7
603 subccc @acc[3],$t3,@acc[3]
604 subccc @acc[4],$t4,@acc[4]
605 subccc @acc[5],$t5,@acc[5]
606 subccc @acc[6],$t6,@acc[6]
607 subccc @acc[7],$t7,@acc[7]
608 subc %g0,%g0,$carry ! broadcast borrow bit
609
610.Lreduce_by_add:
611
612 ! if a-b borrows, add modulus.
613 !
614 ! Note that because mod has special form, i.e. consists of
615 ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
616 ! using value of broadcasted borrow and the borrow bit itself.
617 ! To minimize dependency chain we first broadcast and then
618 ! extract the bit by negating (follow $bi).
619
620 addcc @acc[0],$carry,@acc[0] ! add synthesized modulus
621 addccc @acc[1],$carry,@acc[1]
622 neg $carry,$bi
623 st @acc[0],[$rp]
624 addccc @acc[2],$carry,@acc[2]
625 st @acc[1],[$rp+4]
626 addccc @acc[3],0,@acc[3]
627 st @acc[2],[$rp+8]
628 addccc @acc[4],0,@acc[4]
629 st @acc[3],[$rp+12]
630 addccc @acc[5],0,@acc[5]
631 st @acc[4],[$rp+16]
632 addccc @acc[6],$bi,@acc[6]
633 st @acc[5],[$rp+20]
634 addc @acc[7],$carry,@acc[7]
635 st @acc[6],[$rp+24]
636 retl
637 st @acc[7],[$rp+28]
638.type	__ecp_nistz256_sub_from,#function
639.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
640
641.align 32
642__ecp_nistz256_sub_morf:
643 ld [$bp+0],$t0 ! b[0]
644 ld [$bp+4],$t1
645 ld [$bp+8],$t2
646 ld [$bp+12],$t3
647 subcc $t0,@acc[0],@acc[0]
648 ld [$bp+16],$t4
649 ld [$bp+20],$t5
650 subccc $t1,@acc[1],@acc[1]
651 subccc $t2,@acc[2],@acc[2]
652 ld [$bp+24],$t6
653 ld [$bp+28],$t7
654 subccc $t3,@acc[3],@acc[3]
655 subccc $t4,@acc[4],@acc[4]
656 subccc $t5,@acc[5],@acc[5]
657 subccc $t6,@acc[6],@acc[6]
658 subccc $t7,@acc[7],@acc[7]
659 b .Lreduce_by_add
660 subc %g0,%g0,$carry ! broadcast borrow bit
661.type	__ecp_nistz256_sub_morf,#function
662.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
663
664! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
665.globl ecp_nistz256_div_by_2
666.align 32
667ecp_nistz256_div_by_2:
668 save %sp,-STACK_FRAME,%sp
669 ld [$ap],@acc[0]
670 ld [$ap+4],@acc[1]
671 ld [$ap+8],@acc[2]
672 ld [$ap+12],@acc[3]
673 ld [$ap+16],@acc[4]
674 ld [$ap+20],@acc[5]
675 ld [$ap+24],@acc[6]
676 call __ecp_nistz256_div_by_2
677 ld [$ap+28],@acc[7]
678 ret
679 restore
680.type	ecp_nistz256_div_by_2,#function
681.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
682
683.align 32
684__ecp_nistz256_div_by_2:
685 ! ret = (a is odd ? a+mod : a) >> 1
686
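	! Why this yields a/2 mod p (illustration): p is odd, so exactly
	! one of a and a+p is even; if a is odd the synthesized modulus
	! (the same $bi/$carry trick as above) is added, which does not
	! change the residue class, and the shift below then halves an
	! even value exactly.  The bit caught in $carry is the possible
	! carry out of a+p and becomes the top bit after the shift.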
687 and @acc[0],1,$bi
688 neg $bi,$carry
689 addcc @acc[0],$carry,@acc[0]
690 addccc @acc[1],$carry,@acc[1]
691 addccc @acc[2],$carry,@acc[2]
692 addccc @acc[3],0,@acc[3]
693 addccc @acc[4],0,@acc[4]
694 addccc @acc[5],0,@acc[5]
695 addccc @acc[6],$bi,@acc[6]
696 addccc @acc[7],$carry,@acc[7]
697 addc %g0,%g0,$carry
698
699 ! ret >>= 1
700
701 srl @acc[0],1,@acc[0]
702 sll @acc[1],31,$t0
703 srl @acc[1],1,@acc[1]
704 or @acc[0],$t0,@acc[0]
705 sll @acc[2],31,$t1
706 srl @acc[2],1,@acc[2]
707 or @acc[1],$t1,@acc[1]
708 sll @acc[3],31,$t2
709 st @acc[0],[$rp]
710 srl @acc[3],1,@acc[3]
711 or @acc[2],$t2,@acc[2]
712 sll @acc[4],31,$t3
713 st @acc[1],[$rp+4]
714 srl @acc[4],1,@acc[4]
715 or @acc[3],$t3,@acc[3]
716 sll @acc[5],31,$t4
717 st @acc[2],[$rp+8]
718 srl @acc[5],1,@acc[5]
719 or @acc[4],$t4,@acc[4]
720 sll @acc[6],31,$t5
721 st @acc[3],[$rp+12]
722 srl @acc[6],1,@acc[6]
723 or @acc[5],$t5,@acc[5]
724 sll @acc[7],31,$t6
725 st @acc[4],[$rp+16]
726 srl @acc[7],1,@acc[7]
727 or @acc[6],$t6,@acc[6]
728 sll $carry,31,$t7
729 st @acc[5],[$rp+20]
730 or @acc[7],$t7,@acc[7]
731 st @acc[6],[$rp+24]
732 retl
733 st @acc[7],[$rp+28]
734.type	__ecp_nistz256_div_by_2,#function
735.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
736___
737
738########################################################################
739# The following subroutines are "literal" implementations of those found
740# in ecp_nistz256.c
741#
742########################################################################
743# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
744#
745{
746my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
747# above map() describes stack layout with 4 temporary
748# 256-bit vectors on top.
749
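# For orientation, a summary of the annotated call sequence below (it
# mirrors ecp_nistz256_point_double() in ecp_nistz256.c); the doubling
# computed is the usual Jacobian one:
#
#	Zsqr  = in_z^2
#	S     = (2*in_y)^2
#	M     = 3*(in_x+Zsqr)*(in_x-Zsqr)
#	res_z = 2*in_y*in_z
#	res_x = M^2 - 2*S*in_x
#	res_y = M*(S*in_x - res_x) - S^2/2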
750$code.=<<___;
751#ifdef __PIC__
752SPARC_PIC_THUNK(%g1)
753#endif
754
755.globl ecp_nistz256_point_double
756.align 32
757ecp_nistz256_point_double:
758 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
759 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
760 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
761 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
762 be ecp_nistz256_point_double_vis3
763 nop
764
765 save %sp,-STACK_FRAME-32*4,%sp
766
767 mov $rp,$rp_real
768 mov $ap,$ap_real
769
770.Lpoint_double_shortcut:
771 ld [$ap+32],@acc[0]
772 ld [$ap+32+4],@acc[1]
773 ld [$ap+32+8],@acc[2]
774 ld [$ap+32+12],@acc[3]
775 ld [$ap+32+16],@acc[4]
776 ld [$ap+32+20],@acc[5]
777 ld [$ap+32+24],@acc[6]
778 ld [$ap+32+28],@acc[7]
779 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
780 add %sp,LOCALS+$S,$rp
781
782 add $ap_real,64,$bp
783 add $ap_real,64,$ap
784 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
785 add %sp,LOCALS+$Zsqr,$rp
786
787 add $ap_real,0,$bp
788 call __ecp_nistz256_add ! p256_add(M, Zsqr, in_x);
789 add %sp,LOCALS+$M,$rp
790
791 add %sp,LOCALS+$S,$bp
792 add %sp,LOCALS+$S,$ap
793 call __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
794 add %sp,LOCALS+$S,$rp
795
796 ld [$ap_real],@acc[0]
797 add %sp,LOCALS+$Zsqr,$bp
798 ld [$ap_real+4],@acc[1]
799 ld [$ap_real+8],@acc[2]
800 ld [$ap_real+12],@acc[3]
801 ld [$ap_real+16],@acc[4]
802 ld [$ap_real+20],@acc[5]
803 ld [$ap_real+24],@acc[6]
804 ld [$ap_real+28],@acc[7]
805 call __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
806 add %sp,LOCALS+$Zsqr,$rp
807
808 add $ap_real,32,$bp
809 add $ap_real,64,$ap
810 call __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
811 add %sp,LOCALS+$tmp0,$rp
812
813 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
814 add $rp_real,64,$rp
815
816 add %sp,LOCALS+$Zsqr,$bp
817 add %sp,LOCALS+$M,$ap
818 call __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
819 add %sp,LOCALS+$M,$rp
820
821 call __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
822 add %sp,LOCALS+$M,$rp
823
824 add %sp,LOCALS+$S,$bp
825 add %sp,LOCALS+$S,$ap
826 call __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
827 add %sp,LOCALS+$tmp0,$rp
828
829 call __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
830 add $rp_real,32,$rp
831
832 add $ap_real,0,$bp
833 add %sp,LOCALS+$S,$ap
834 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
835 add %sp,LOCALS+$S,$rp
836
837 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
838 add %sp,LOCALS+$tmp0,$rp
839
840 add %sp,LOCALS+$M,$bp
841 add %sp,LOCALS+$M,$ap
842 call __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
843 add $rp_real,0,$rp
844
845 add %sp,LOCALS+$tmp0,$bp
846 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
847 add $rp_real,0,$rp
848
849 add %sp,LOCALS+$S,$bp
850 call __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
851 add %sp,LOCALS+$S,$rp
852
853 add %sp,LOCALS+$M,$bp
854 add %sp,LOCALS+$S,$ap
855 call __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
856 add %sp,LOCALS+$S,$rp
857
858 add $rp_real,32,$bp
859 call __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
860 add $rp_real,32,$rp
861
862 ret
863 restore
864.type	ecp_nistz256_point_double,#function
865.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
866___
867}
868
869########################################################################
870# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
871# const P256_POINT *in2);
872{
873my ($res_x,$res_y,$res_z,
874 $H,$Hsqr,$R,$Rsqr,$Hcub,
875 $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
876my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
877
878# above map() describes stack layout with 12 temporary
879# 256-bit vectors on top. Then we reserve some space for
880# !in1infty, !in2infty, result of check for zero and return pointer.
881
882my $bp_real=$rp_real;
883
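# For orientation, the call sequence below follows the textbook Jacobian
# addition also implemented by ecp_nistz256_point_add() in ecp_nistz256.c:
#
#	U1 = in1_x*Z2sqr,       U2 = in2_x*Z1sqr,       H = U2-U1
#	S1 = in1_y*Z2sqr*in2_z, S2 = in2_y*Z1sqr*in1_z, R = S2-S1
#	res_x = R^2 - H^3 - 2*U1*H^2
#	res_y = R*(U1*H^2 - res_x) - S1*H^3
#	res_z = in1_z*in2_z*H
#
# H==0 with R!=0 means the inputs were each other's negatives (the result
# is set to zero), H==0 with R==0 means the doubling path is taken, and
# the infinity flags select the untouched other input at the very end.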
884$code.=<<___;
885.globl ecp_nistz256_point_add
886.align 32
887ecp_nistz256_point_add:
888 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
889 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
890 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
891 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
892 be ecp_nistz256_point_add_vis3
893 nop
894
895 save %sp,-STACK_FRAME-32*12-32,%sp
896
897 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp
898 mov $ap,$ap_real
899 mov $bp,$bp_real
900
901 ld [$bp+64],$t0 ! in2_z
902 ld [$bp+64+4],$t1
903 ld [$bp+64+8],$t2
904 ld [$bp+64+12],$t3
905 ld [$bp+64+16],$t4
906 ld [$bp+64+20],$t5
907 ld [$bp+64+24],$t6
908 ld [$bp+64+28],$t7
909 or $t1,$t0,$t0
910 or $t3,$t2,$t2
911 or $t5,$t4,$t4
912 or $t7,$t6,$t6
913 or $t2,$t0,$t0
914 or $t6,$t4,$t4
915	or	$t4,$t0,$t0		! !in2infty
916 movrnz $t0,-1,$t0
917 st $t0,[%fp+STACK_BIAS-12]
918
919 ld [$ap+64],$t0 ! in1_z
920 ld [$ap+64+4],$t1
921 ld [$ap+64+8],$t2
922 ld [$ap+64+12],$t3
923 ld [$ap+64+16],$t4
924 ld [$ap+64+20],$t5
925 ld [$ap+64+24],$t6
926 ld [$ap+64+28],$t7
927 or $t1,$t0,$t0
928 or $t3,$t2,$t2
929 or $t5,$t4,$t4
930 or $t7,$t6,$t6
931 or $t2,$t0,$t0
932 or $t6,$t4,$t4
933	or	$t4,$t0,$t0		! !in1infty
934 movrnz $t0,-1,$t0
935 st $t0,[%fp+STACK_BIAS-16]
936
937 add $bp_real,64,$bp
938 add $bp_real,64,$ap
939 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
940 add %sp,LOCALS+$Z2sqr,$rp
941
942 add $ap_real,64,$bp
943 add $ap_real,64,$ap
944 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
945 add %sp,LOCALS+$Z1sqr,$rp
946
947 add $bp_real,64,$bp
948 add %sp,LOCALS+$Z2sqr,$ap
949 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
950 add %sp,LOCALS+$S1,$rp
951
952 add $ap_real,64,$bp
953 add %sp,LOCALS+$Z1sqr,$ap
954 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
955 add %sp,LOCALS+$S2,$rp
956
957 add $ap_real,32,$bp
958 add %sp,LOCALS+$S1,$ap
959 call __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
960 add %sp,LOCALS+$S1,$rp
961
962 add $bp_real,32,$bp
963 add %sp,LOCALS+$S2,$ap
964 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
965 add %sp,LOCALS+$S2,$rp
966
967 add %sp,LOCALS+$S1,$bp
968 call __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
969 add %sp,LOCALS+$R,$rp
970
971 or @acc[1],@acc[0],@acc[0] ! see if result is zero
972 or @acc[3],@acc[2],@acc[2]
973 or @acc[5],@acc[4],@acc[4]
974 or @acc[7],@acc[6],@acc[6]
975 or @acc[2],@acc[0],@acc[0]
976 or @acc[6],@acc[4],@acc[4]
977 or @acc[4],@acc[0],@acc[0]
978 st @acc[0],[%fp+STACK_BIAS-20]
979
980 add $ap_real,0,$bp
981 add %sp,LOCALS+$Z2sqr,$ap
982 call __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
983 add %sp,LOCALS+$U1,$rp
984
985 add $bp_real,0,$bp
986 add %sp,LOCALS+$Z1sqr,$ap
987 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
988 add %sp,LOCALS+$U2,$rp
989
990 add %sp,LOCALS+$U1,$bp
991 call __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
992 add %sp,LOCALS+$H,$rp
993
994 or @acc[1],@acc[0],@acc[0] ! see if result is zero
995 or @acc[3],@acc[2],@acc[2]
996 or @acc[5],@acc[4],@acc[4]
997 or @acc[7],@acc[6],@acc[6]
998 or @acc[2],@acc[0],@acc[0]
999 or @acc[6],@acc[4],@acc[4]
1000 orcc @acc[4],@acc[0],@acc[0]
1001
1002 bne,pt %icc,.Ladd_proceed ! is_equal(U1,U2)?
1003 nop
1004
1005 ld [%fp+STACK_BIAS-12],$t0
1006 ld [%fp+STACK_BIAS-16],$t1
1007 ld [%fp+STACK_BIAS-20],$t2
1008 andcc $t0,$t1,%g0
1009 be,pt %icc,.Ladd_proceed ! (in1infty || in2infty)?
1010 nop
1011 andcc $t2,$t2,%g0
1012	be,pt	%icc,.Ladd_double	! is_equal(S1,S2)?
1013 nop
1014
1015 ldx [%fp+STACK_BIAS-8],$rp
1016 st %g0,[$rp]
1017 st %g0,[$rp+4]
1018 st %g0,[$rp+8]
1019 st %g0,[$rp+12]
1020 st %g0,[$rp+16]
1021 st %g0,[$rp+20]
1022 st %g0,[$rp+24]
1023 st %g0,[$rp+28]
1024 st %g0,[$rp+32]
1025 st %g0,[$rp+32+4]
1026 st %g0,[$rp+32+8]
1027 st %g0,[$rp+32+12]
1028 st %g0,[$rp+32+16]
1029 st %g0,[$rp+32+20]
1030 st %g0,[$rp+32+24]
1031 st %g0,[$rp+32+28]
1032 st %g0,[$rp+64]
1033 st %g0,[$rp+64+4]
1034 st %g0,[$rp+64+8]
1035 st %g0,[$rp+64+12]
1036 st %g0,[$rp+64+16]
1037 st %g0,[$rp+64+20]
1038 st %g0,[$rp+64+24]
1039 st %g0,[$rp+64+28]
1040 b .Ladd_done
1041 nop
1042
1043.align 16
1044.Ladd_double:
1045 ldx [%fp+STACK_BIAS-8],$rp_real
1046 mov $ap_real,$ap
1047 b .Lpoint_double_shortcut
1048 add %sp,32*(12-4)+32,%sp ! difference in frame sizes
1049
1050.align 16
1051.Ladd_proceed:
1052 add %sp,LOCALS+$R,$bp
1053 add %sp,LOCALS+$R,$ap
1054 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1055 add %sp,LOCALS+$Rsqr,$rp
1056
1057 add $ap_real,64,$bp
1058 add %sp,LOCALS+$H,$ap
1059 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1060 add %sp,LOCALS+$res_z,$rp
1061
1062 add %sp,LOCALS+$H,$bp
1063 add %sp,LOCALS+$H,$ap
1064 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1065 add %sp,LOCALS+$Hsqr,$rp
1066
1067 add $bp_real,64,$bp
1068 add %sp,LOCALS+$res_z,$ap
1069 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
1070 add %sp,LOCALS+$res_z,$rp
1071
1072 add %sp,LOCALS+$H,$bp
1073 add %sp,LOCALS+$Hsqr,$ap
1074 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1075 add %sp,LOCALS+$Hcub,$rp
1076
1077 add %sp,LOCALS+$U1,$bp
1078 add %sp,LOCALS+$Hsqr,$ap
1079 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1080 add %sp,LOCALS+$U2,$rp
1081
1082 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1083 add %sp,LOCALS+$Hsqr,$rp
1084
1085 add %sp,LOCALS+$Rsqr,$bp
1086 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1087 add %sp,LOCALS+$res_x,$rp
1088
1089 add %sp,LOCALS+$Hcub,$bp
1090 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub);
1091 add %sp,LOCALS+$res_x,$rp
1092
1093 add %sp,LOCALS+$U2,$bp
1094 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1095 add %sp,LOCALS+$res_y,$rp
1096
1097 add %sp,LOCALS+$Hcub,$bp
1098 add %sp,LOCALS+$S1,$ap
1099 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1100 add %sp,LOCALS+$S2,$rp
1101
1102 add %sp,LOCALS+$R,$bp
1103 add %sp,LOCALS+$res_y,$ap
1104 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1105 add %sp,LOCALS+$res_y,$rp
1106
1107 add %sp,LOCALS+$S2,$bp
1108 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1109 add %sp,LOCALS+$res_y,$rp
1110
1111 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty
1112 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty
1113 ldx [%fp+STACK_BIAS-8],$rp
1114___
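# The flags stored at STACK_BIAS-16/-12 now drive a branch-free select:
# if in1 was the point at infinity ($t1==0) the result is replaced by in2,
# if in2 was infinity ($t2==0) it is replaced by in1, and otherwise the
# computed point stands.  Each loop iteration below moves one 8-byte slice
# of the 96-byte result this way.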
1115for($i=0;$i<96;$i+=8) { # conditional moves
1116$code.=<<___;
1117 ld [%sp+LOCALS+$i],@acc[0] ! res
1118 ld [%sp+LOCALS+$i+4],@acc[1]
1119 ld [$bp_real+$i],@acc[2] ! in2
1120 ld [$bp_real+$i+4],@acc[3]
1121 ld [$ap_real+$i],@acc[4] ! in1
1122 ld [$ap_real+$i+4],@acc[5]
1123 movrz $t1,@acc[2],@acc[0]
1124 movrz $t1,@acc[3],@acc[1]
1125 movrz $t2,@acc[4],@acc[0]
1126 movrz $t2,@acc[5],@acc[1]
1127 st @acc[0],[$rp+$i]
1128 st @acc[1],[$rp+$i+4]
1129___
1130}
1131$code.=<<___;
1132.Ladd_done:
1133 ret
1134 restore
1135.type	ecp_nistz256_point_add,#function
1136.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1137___
1138}
1139
1140########################################################################
1141# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1142# const P256_POINT_AFFINE *in2);
1143{
1144my ($res_x,$res_y,$res_z,
1145 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1146my $Z1sqr = $S2;
1147# above map() describes stack layout with 10 temporary
1148# 256-bit vectors on top. Then we reserve some space for
1149# !in1infty, !in2infty, result of check for zero and return pointer.
1150
1151my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
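# @ONE_mont is 1 in Montgomery representation, i.e. 2^256 mod P, written
# least significant word first; it stands in for the affine point's
# implicit Z=1 coordinate when the result falls back to in2 below.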
1152my $bp_real=$rp_real;
1153
1154$code.=<<___;
1155.globl ecp_nistz256_point_add_affine
1156.align 32
1157ecp_nistz256_point_add_affine:
1158 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1159 ld [%g1],%g1 ! OPENSSL_sparcv9cap_P[0]
1160 and %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1161 cmp %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1162 be ecp_nistz256_point_add_affine_vis3
1163 nop
1164
1165 save %sp,-STACK_FRAME-32*10-32,%sp
1166
1167 stx $rp,[%fp+STACK_BIAS-8] ! off-load $rp
1168 mov $ap,$ap_real
1169 mov $bp,$bp_real
1170
1171 ld [$ap+64],$t0 ! in1_z
1172 ld [$ap+64+4],$t1
1173 ld [$ap+64+8],$t2
1174 ld [$ap+64+12],$t3
1175 ld [$ap+64+16],$t4
1176 ld [$ap+64+20],$t5
1177 ld [$ap+64+24],$t6
1178 ld [$ap+64+28],$t7
1179 or $t1,$t0,$t0
1180 or $t3,$t2,$t2
1181 or $t5,$t4,$t4
1182 or $t7,$t6,$t6
1183 or $t2,$t0,$t0
1184 or $t6,$t4,$t4
1185	or	$t4,$t0,$t0		! !in1infty
1186 movrnz $t0,-1,$t0
1187 st $t0,[%fp+STACK_BIAS-16]
1188
1189 ld [$bp],@acc[0] ! in2_x
1190 ld [$bp+4],@acc[1]
1191 ld [$bp+8],@acc[2]
1192 ld [$bp+12],@acc[3]
1193 ld [$bp+16],@acc[4]
1194 ld [$bp+20],@acc[5]
1195 ld [$bp+24],@acc[6]
1196 ld [$bp+28],@acc[7]
1197 ld [$bp+32],$t0 ! in2_y
1198 ld [$bp+32+4],$t1
1199 ld [$bp+32+8],$t2
1200 ld [$bp+32+12],$t3
1201 ld [$bp+32+16],$t4
1202 ld [$bp+32+20],$t5
1203 ld [$bp+32+24],$t6
1204 ld [$bp+32+28],$t7
1205 or @acc[1],@acc[0],@acc[0]
1206 or @acc[3],@acc[2],@acc[2]
1207 or @acc[5],@acc[4],@acc[4]
1208 or @acc[7],@acc[6],@acc[6]
1209 or @acc[2],@acc[0],@acc[0]
1210 or @acc[6],@acc[4],@acc[4]
1211 or @acc[4],@acc[0],@acc[0]
1212 or $t1,$t0,$t0
1213 or $t3,$t2,$t2
1214 or $t5,$t4,$t4
1215 or $t7,$t6,$t6
1216 or $t2,$t0,$t0
1217 or $t6,$t4,$t4
1218 or $t4,$t0,$t0
1219 or @acc[0],$t0,$t0 ! !in2infty
1220 movrnz $t0,-1,$t0
1221 st $t0,[%fp+STACK_BIAS-12]
1222
1223 add $ap_real,64,$bp
1224 add $ap_real,64,$ap
1225 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1226 add %sp,LOCALS+$Z1sqr,$rp
1227
1228 add $bp_real,0,$bp
1229 add %sp,LOCALS+$Z1sqr,$ap
1230 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1231 add %sp,LOCALS+$U2,$rp
1232
1233 add $ap_real,0,$bp
1234 call __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1235 add %sp,LOCALS+$H,$rp
1236
1237 add $ap_real,64,$bp
1238 add %sp,LOCALS+$Z1sqr,$ap
1239 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1240 add %sp,LOCALS+$S2,$rp
1241
1242 add $ap_real,64,$bp
1243 add %sp,LOCALS+$H,$ap
1244 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1245 add %sp,LOCALS+$res_z,$rp
1246
1247 add $bp_real,32,$bp
1248 add %sp,LOCALS+$S2,$ap
1249 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1250 add %sp,LOCALS+$S2,$rp
1251
1252 add $ap_real,32,$bp
1253 call __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1254 add %sp,LOCALS+$R,$rp
1255
1256 add %sp,LOCALS+$H,$bp
1257 add %sp,LOCALS+$H,$ap
1258 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1259 add %sp,LOCALS+$Hsqr,$rp
1260
1261 add %sp,LOCALS+$R,$bp
1262 add %sp,LOCALS+$R,$ap
1263 call __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1264 add %sp,LOCALS+$Rsqr,$rp
1265
1266 add %sp,LOCALS+$H,$bp
1267 add %sp,LOCALS+$Hsqr,$ap
1268 call __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1269 add %sp,LOCALS+$Hcub,$rp
1270
1271 add $ap_real,0,$bp
1272 add %sp,LOCALS+$Hsqr,$ap
1273 call __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1274 add %sp,LOCALS+$U2,$rp
1275
1276 call __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1277 add %sp,LOCALS+$Hsqr,$rp
1278
1279 add %sp,LOCALS+$Rsqr,$bp
1280 call __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1281 add %sp,LOCALS+$res_x,$rp
1282
1283 add %sp,LOCALS+$Hcub,$bp
1284 call __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, Hcub);
1285 add %sp,LOCALS+$res_x,$rp
1286
1287 add %sp,LOCALS+$U2,$bp
1288 call __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1289 add %sp,LOCALS+$res_y,$rp
1290
1291 add $ap_real,32,$bp
1292 add %sp,LOCALS+$Hcub,$ap
1293 call __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1294 add %sp,LOCALS+$S2,$rp
1295
1296 add %sp,LOCALS+$R,$bp
1297 add %sp,LOCALS+$res_y,$ap
1298 call __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1299 add %sp,LOCALS+$res_y,$rp
1300
1301 add %sp,LOCALS+$S2,$bp
1302 call __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1303 add %sp,LOCALS+$res_y,$rp
1304
1305 ld [%fp+STACK_BIAS-16],$t1 ! !in1infty
1306 ld [%fp+STACK_BIAS-12],$t2 ! !in2infty
1307 ldx [%fp+STACK_BIAS-8],$rp
1308___
1309for($i=0;$i<64;$i+=8) { # conditional moves
1310$code.=<<___;
1311 ld [%sp+LOCALS+$i],@acc[0] ! res
1312 ld [%sp+LOCALS+$i+4],@acc[1]
1313 ld [$bp_real+$i],@acc[2] ! in2
1314 ld [$bp_real+$i+4],@acc[3]
1315 ld [$ap_real+$i],@acc[4] ! in1
1316 ld [$ap_real+$i+4],@acc[5]
1317 movrz $t1,@acc[2],@acc[0]
1318 movrz $t1,@acc[3],@acc[1]
1319 movrz $t2,@acc[4],@acc[0]
1320 movrz $t2,@acc[5],@acc[1]
1321 st @acc[0],[$rp+$i]
1322 st @acc[1],[$rp+$i+4]
1323___
1324}
1325for(;$i<96;$i+=8) {
1326my $j=($i-64)/4;
1327$code.=<<___;
1328 ld [%sp+LOCALS+$i],@acc[0] ! res
1329 ld [%sp+LOCALS+$i+4],@acc[1]
1330 ld [$ap_real+$i],@acc[4] ! in1
1331 ld [$ap_real+$i+4],@acc[5]
1332 movrz $t1,@ONE_mont[$j],@acc[0]
1333 movrz $t1,@ONE_mont[$j+1],@acc[1]
1334 movrz $t2,@acc[4],@acc[0]
1335 movrz $t2,@acc[5],@acc[1]
1336 st @acc[0],[$rp+$i]
1337 st @acc[1],[$rp+$i+4]
1338___
1339}
1340$code.=<<___;
1341 ret
1342 restore
1343.type	ecp_nistz256_point_add_affine,#function
1344.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1345___
1346} }}}
1347{{{
1348my ($out,$inp,$index)=map("%i$_",(0..2));
1349my $mask="%o0";
1350
1351$code.=<<___;
1352! void ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1353! int %i2);
1354.globl ecp_nistz256_scatter_w5
1355.align 32
1356ecp_nistz256_scatter_w5:
1357 save %sp,-STACK_FRAME,%sp
1358
1359 sll $index,2,$index
1360 add $out,$index,$out
1361
1362 ld [$inp],%l0 ! X
1363 ld [$inp+4],%l1
1364 ld [$inp+8],%l2
1365 ld [$inp+12],%l3
1366 ld [$inp+16],%l4
1367 ld [$inp+20],%l5
1368 ld [$inp+24],%l6
1369 ld [$inp+28],%l7
1370 add $inp,32,$inp
1371 st %l0,[$out+64*0-4]
1372 st %l1,[$out+64*1-4]
1373 st %l2,[$out+64*2-4]
1374 st %l3,[$out+64*3-4]
1375 st %l4,[$out+64*4-4]
1376 st %l5,[$out+64*5-4]
1377 st %l6,[$out+64*6-4]
1378 st %l7,[$out+64*7-4]
1379 add $out,64*8,$out
1380
1381 ld [$inp],%l0 ! Y
1382 ld [$inp+4],%l1
1383 ld [$inp+8],%l2
1384 ld [$inp+12],%l3
1385 ld [$inp+16],%l4
1386 ld [$inp+20],%l5
1387 ld [$inp+24],%l6
1388 ld [$inp+28],%l7
1389 add $inp,32,$inp
1390 st %l0,[$out+64*0-4]
1391 st %l1,[$out+64*1-4]
1392 st %l2,[$out+64*2-4]
1393 st %l3,[$out+64*3-4]
1394 st %l4,[$out+64*4-4]
1395 st %l5,[$out+64*5-4]
1396 st %l6,[$out+64*6-4]
1397 st %l7,[$out+64*7-4]
1398 add $out,64*8,$out
1399
1400 ld [$inp],%l0 ! Z
1401 ld [$inp+4],%l1
1402 ld [$inp+8],%l2
1403 ld [$inp+12],%l3
1404 ld [$inp+16],%l4
1405 ld [$inp+20],%l5
1406 ld [$inp+24],%l6
1407 ld [$inp+28],%l7
1408 st %l0,[$out+64*0-4]
1409 st %l1,[$out+64*1-4]
1410 st %l2,[$out+64*2-4]
1411 st %l3,[$out+64*3-4]
1412 st %l4,[$out+64*4-4]
1413 st %l5,[$out+64*5-4]
1414 st %l6,[$out+64*6-4]
1415 st %l7,[$out+64*7-4]
1416
1417 ret
1418 restore
1419.type	ecp_nistz256_scatter_w5,#function
1420.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1421
1422! void ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1423! int %i2);
1424.globl ecp_nistz256_gather_w5
1425.align 32
1426ecp_nistz256_gather_w5:
1427 save %sp,-STACK_FRAME,%sp
1428
1429 neg $index,$mask
1430 srax $mask,63,$mask
1431
1432 add $index,$mask,$index
1433 sll $index,2,$index
1434 add $inp,$index,$inp
1435
1436 ld [$inp+64*0],%l0
1437 ld [$inp+64*1],%l1
1438 ld [$inp+64*2],%l2
1439 ld [$inp+64*3],%l3
1440 ld [$inp+64*4],%l4
1441 ld [$inp+64*5],%l5
1442 ld [$inp+64*6],%l6
1443 ld [$inp+64*7],%l7
1444 add $inp,64*8,$inp
1445 and %l0,$mask,%l0
1446 and %l1,$mask,%l1
1447 st %l0,[$out] ! X
1448 and %l2,$mask,%l2
1449 st %l1,[$out+4]
1450 and %l3,$mask,%l3
1451 st %l2,[$out+8]
1452 and %l4,$mask,%l4
1453 st %l3,[$out+12]
1454 and %l5,$mask,%l5
1455 st %l4,[$out+16]
1456 and %l6,$mask,%l6
1457 st %l5,[$out+20]
1458 and %l7,$mask,%l7
1459 st %l6,[$out+24]
1460 st %l7,[$out+28]
1461 add $out,32,$out
1462
1463 ld [$inp+64*0],%l0
1464 ld [$inp+64*1],%l1
1465 ld [$inp+64*2],%l2
1466 ld [$inp+64*3],%l3
1467 ld [$inp+64*4],%l4
1468 ld [$inp+64*5],%l5
1469 ld [$inp+64*6],%l6
1470 ld [$inp+64*7],%l7
1471 add $inp,64*8,$inp
1472 and %l0,$mask,%l0
1473 and %l1,$mask,%l1
1474 st %l0,[$out] ! Y
1475 and %l2,$mask,%l2
1476 st %l1,[$out+4]
1477 and %l3,$mask,%l3
1478 st %l2,[$out+8]
1479 and %l4,$mask,%l4
1480 st %l3,[$out+12]
1481 and %l5,$mask,%l5
1482 st %l4,[$out+16]
1483 and %l6,$mask,%l6
1484 st %l5,[$out+20]
1485 and %l7,$mask,%l7
1486 st %l6,[$out+24]
1487 st %l7,[$out+28]
1488 add $out,32,$out
1489
1490 ld [$inp+64*0],%l0
1491 ld [$inp+64*1],%l1
1492 ld [$inp+64*2],%l2
1493 ld [$inp+64*3],%l3
1494 ld [$inp+64*4],%l4
1495 ld [$inp+64*5],%l5
1496 ld [$inp+64*6],%l6
1497 ld [$inp+64*7],%l7
1498 and %l0,$mask,%l0
1499 and %l1,$mask,%l1
1500 st %l0,[$out] ! Z
1501 and %l2,$mask,%l2
1502 st %l1,[$out+4]
1503 and %l3,$mask,%l3
1504 st %l2,[$out+8]
1505 and %l4,$mask,%l4
1506 st %l3,[$out+12]
1507 and %l5,$mask,%l5
1508 st %l4,[$out+16]
1509 and %l6,$mask,%l6
1510 st %l5,[$out+20]
1511 and %l7,$mask,%l7
1512 st %l6,[$out+24]
1513 st %l7,[$out+28]
1514
1515 ret
1516 restore
1517.type	ecp_nistz256_gather_w5,#function
1518.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1519
1520! void ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1521! int %i2);
1522.globl ecp_nistz256_scatter_w7
1523.align 32
1524ecp_nistz256_scatter_w7:
1525 save %sp,-STACK_FRAME,%sp
1526 nop
1527 add $out,$index,$out
1528 mov 64/4,$index
1529.Loop_scatter_w7:
1530 ld [$inp],%l0
1531 add $inp,4,$inp
1532 subcc $index,1,$index
1533	stb	%l0,[$out+64*0]
1534	srl	%l0,8,%l1
1535	stb	%l1,[$out+64*1]
1536	srl	%l0,16,%l2
1537	stb	%l2,[$out+64*2]
1538	srl	%l0,24,%l3
1539	stb	%l3,[$out+64*3]
1540 bne .Loop_scatter_w7
1541 add $out,64*4,$out
1542
1543 ret
1544 restore
1545.type	ecp_nistz256_scatter_w7,#function
1546.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1547
1548! void ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1549! int %i2);
1550.globl ecp_nistz256_gather_w7
1551.align 32
1552ecp_nistz256_gather_w7:
1553 save %sp,-STACK_FRAME,%sp
1554
1555 neg $index,$mask
1556 srax $mask,63,$mask
1557
1558 add $index,$mask,$index
1559 add $inp,$index,$inp
1560 mov 64/4,$index
1561
1562.Loop_gather_w7:
1563 ldub [$inp+64*0],%l0
1564 prefetch [$inp+3840+64*0],1
1565 subcc $index,1,$index
1566 ldub [$inp+64*1],%l1
1567 prefetch [$inp+3840+64*1],1
1568 ldub [$inp+64*2],%l2
1569 prefetch [$inp+3840+64*2],1
1570 ldub [$inp+64*3],%l3
1571 prefetch [$inp+3840+64*3],1
1572 add $inp,64*4,$inp
1573 sll %l1,8,%l1
1574 sll %l2,16,%l2
1575 or %l0,%l1,%l0
1576 sll %l3,24,%l3
1577 or %l0,%l2,%l0
1578 or %l0,%l3,%l0
1579 and %l0,$mask,%l0
1580 st %l0,[$out]
1581 bne .Loop_gather_w7
1582 add $out,4,$out
1583
1584 ret
1585 restore
1586.type	ecp_nistz256_gather_w7,#function
1587.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1588___
1589}}}
1590{{{
1591########################################################################
1592# The following subroutines are VIS3 counterparts of those above that
1593# implement the ones found in ecp_nistz256.c. The key difference is that they
1594# use 128-bit multiplication and addition with 64-bit carry, and in order
1595# to do that they perform conversion from uint32_t[8] to uint64_t[4] upon
1596# entry and vice versa on return.
1597#
1598my ($rp,$ap,$bp)=map("%i$_",(0..2));
1599my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1600my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1601my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1602my ($rp_real,$ap_real)=("%g2","%g3");
1603my ($acc6,$acc7)=($bp,$bi); # used in squaring
1604
1605$code.=<<___;
1606.align 32
1607__ecp_nistz256_mul_by_2_vis3:
1608 addcc $acc0,$acc0,$acc0
1609 addxccc $acc1,$acc1,$acc1
1610 addxccc $acc2,$acc2,$acc2
1611 addxccc $acc3,$acc3,$acc3
1612 b .Lreduce_by_sub_vis3
1613 addxc %g0,%g0,$acc4 ! did it carry?
1614.type	__ecp_nistz256_mul_by_2_vis3,#function
1615.size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1616
1617.align 32
1618__ecp_nistz256_add_vis3:
1619 ldx [$bp+0],$t0
1620 ldx [$bp+8],$t1
1621 ldx [$bp+16],$t2
1622 ldx [$bp+24],$t3
1623
1624__ecp_nistz256_add_noload_vis3:
1625
1626 addcc $t0,$acc0,$acc0
1627 addxccc $t1,$acc1,$acc1
1628 addxccc $t2,$acc2,$acc2
1629 addxccc $t3,$acc3,$acc3
1630 addxc %g0,%g0,$acc4 ! did it carry?
1631
1632.Lreduce_by_sub_vis3:
1633
1634 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
1635 addxccc $acc1,$poly1,$t1
1636 addxccc $acc2,$minus1,$t2
1637 addxccc $acc3,$poly3,$t3
1638 addxc $acc4,$minus1,$acc4
1639
1640 movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus
1641 movrz $acc4,$t1,$acc1
1642	stx	$acc0,[$rp]
1643	movrz	$acc4,$t2,$acc2
1644	stx	$acc1,[$rp+8]
1645	movrz	$acc4,$t3,$acc3
1646 stx $acc2,[$rp+16]
1647 retl
1648 stx $acc3,[$rp+24]
1649.type	__ecp_nistz256_add_vis3,#function
1650.size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1651
1652! Trouble with subtraction is that there is no subtraction with 64-bit
1653! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
1654! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1655! recall that SPARC is big-endian, which is why you'll observe that
1656! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we
1657! "collect" result back to 64-bit $acc0-$acc3.
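! Illustration of that interleaving: each 64-bit limb is stored
! big-endian, so its least significant half sits at the higher address
! (+4, +12, ...), which is why b is loaded in the order 4,0,12,8,...
! The subtraction then runs 32 bits at a time off the %icc borrow, each
! difference is masked back to 32 bits, and the halves are re-glued with
! sllx/or before the reduction-by-add step.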
1658.align 32
1659__ecp_nistz256_sub_from_vis3:
1660 ld [$bp+4],$t0
1661 ld [$bp+0],$t1
1662 ld [$bp+12],$t2
1663 ld [$bp+8],$t3
1664
1665 srlx $acc0,32,$acc4
1666 not $poly1,$poly1
1667 srlx $acc1,32,$acc5
1668 subcc $acc0,$t0,$acc0
1669 ld [$bp+20],$t0
1670 subccc $acc4,$t1,$acc4
1671 ld [$bp+16],$t1
1672 subccc $acc1,$t2,$acc1
1673 ld [$bp+28],$t2
1674 and $acc0,$poly1,$acc0
1675 subccc $acc5,$t3,$acc5
1676 ld [$bp+24],$t3
1677 sllx $acc4,32,$acc4
1678 and $acc1,$poly1,$acc1
1679 sllx $acc5,32,$acc5
1680 or $acc0,$acc4,$acc0
1681 srlx $acc2,32,$acc4
1682 or $acc1,$acc5,$acc1
1683 srlx $acc3,32,$acc5
1684 subccc $acc2,$t0,$acc2
1685 subccc $acc4,$t1,$acc4
1686 subccc $acc3,$t2,$acc3
1687 and $acc2,$poly1,$acc2
1688 subccc $acc5,$t3,$acc5
1689 sllx $acc4,32,$acc4
1690 and $acc3,$poly1,$acc3
1691 sllx $acc5,32,$acc5
1692 or $acc2,$acc4,$acc2
1693 subc %g0,%g0,$acc4 ! did it borrow?
1694 b .Lreduce_by_add_vis3
1695 or $acc3,$acc5,$acc3
1696.type	__ecp_nistz256_sub_from_vis3,#function
1697.size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1698
1699.align 32
1700__ecp_nistz256_sub_morf_vis3:
1701 ld [$bp+4],$t0
1702 ld [$bp+0],$t1
1703 ld [$bp+12],$t2
1704 ld [$bp+8],$t3
1705
1706 srlx $acc0,32,$acc4
1707 not $poly1,$poly1
1708 srlx $acc1,32,$acc5
1709 subcc $t0,$acc0,$acc0
1710 ld [$bp+20],$t0
1711 subccc $t1,$acc4,$acc4
1712 ld [$bp+16],$t1
1713 subccc $t2,$acc1,$acc1
1714 ld [$bp+28],$t2
1715 and $acc0,$poly1,$acc0
1716 subccc $t3,$acc5,$acc5
1717 ld [$bp+24],$t3
1718 sllx $acc4,32,$acc4
1719 and $acc1,$poly1,$acc1
1720 sllx $acc5,32,$acc5
1721 or $acc0,$acc4,$acc0
1722 srlx $acc2,32,$acc4
1723 or $acc1,$acc5,$acc1
1724 srlx $acc3,32,$acc5
1725 subccc $t0,$acc2,$acc2
1726 subccc $t1,$acc4,$acc4
1727 subccc $t2,$acc3,$acc3
1728 and $acc2,$poly1,$acc2
1729 subccc $t3,$acc5,$acc5
1730 sllx $acc4,32,$acc4
1731 and $acc3,$poly1,$acc3
1732 sllx $acc5,32,$acc5
1733 or $acc2,$acc4,$acc2
1734 subc %g0,%g0,$acc4 ! did it borrow?
1735 or $acc3,$acc5,$acc3
1736
1737.Lreduce_by_add_vis3:
1738
1739 addcc $acc0,-1,$t0 ! add modulus
1740 not $poly3,$t3
1741 addxccc $acc1,$poly1,$t1
1742 not $poly1,$poly1 ! restore $poly1
1743 addxccc $acc2,%g0,$t2
1744 addxc $acc3,$t3,$t3
1745
1746 movrnz $acc4,$t0,$acc0 ! if a-b borrowed, ret = ret+mod
1747 movrnz $acc4,$t1,$acc1
1748 stx $acc0,[$rp]
1749 movrnz $acc4,$t2,$acc2
1750 stx $acc1,[$rp+8]
1751 movrnz $acc4,$t3,$acc3
1752 stx $acc2,[$rp+16]
1753 retl
1754 stx $acc3,[$rp+24]
1755.type	__ecp_nistz256_sub_morf_vis3,#function
1756.size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1757
1758.align 32
1759__ecp_nistz256_div_by_2_vis3:
1760 ! ret = (a is odd ? a+mod : a) >> 1
1761
1762 not $poly1,$t1
1763 not $poly3,$t3
1764 and $acc0,1,$acc5
1765 addcc $acc0,-1,$t0 ! add modulus
1766 addxccc $acc1,$t1,$t1
1767 addxccc $acc2,%g0,$t2
1768 addxccc $acc3,$t3,$t3
1769 addxc %g0,%g0,$acc4 ! carry bit
1770
1771 movrnz $acc5,$t0,$acc0
1772 movrnz $acc5,$t1,$acc1
1773 movrnz $acc5,$t2,$acc2
1774 movrnz $acc5,$t3,$acc3
1775 movrz $acc5,%g0,$acc4
1776
1777 ! ret >>= 1
1778
1779 srlx $acc0,1,$acc0
1780 sllx $acc1,63,$t0
1781 srlx $acc1,1,$acc1
1782 or $acc0,$t0,$acc0
1783 sllx $acc2,63,$t1
1784 srlx $acc2,1,$acc2
1785 or $acc1,$t1,$acc1
1786 sllx $acc3,63,$t2
1787 stx $acc0,[$rp]
1788 srlx $acc3,1,$acc3
1789 or $acc2,$t2,$acc2
1790 sllx $acc4,63,$t3 ! don't forget carry bit
1791 stx $acc1,[$rp+8]
1792 or $acc3,$t3,$acc3
1793 stx $acc2,[$rp+16]
1794 retl
1795 stx $acc3,[$rp+24]
1796.type	__ecp_nistz256_div_by_2_vis3,#function
1797.size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1798
1799! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1800! 4x faster [on T4]...
1801.align 32
1802__ecp_nistz256_mul_mont_vis3:
1803 mulx $a0,$bi,$acc0
1804 not $poly3,$poly3 ! 0xFFFFFFFF00000001
1805 umulxhi $a0,$bi,$t0
1806 mulx $a1,$bi,$acc1
1807 umulxhi $a1,$bi,$t1
1808 mulx $a2,$bi,$acc2
1809 umulxhi $a2,$bi,$t2
1810 mulx $a3,$bi,$acc3
1811 umulxhi $a3,$bi,$t3
1812 ldx [$bp+8],$bi ! b[1]
1813
1814 addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
1815 sllx $acc0,32,$t0
1816 addxccc $acc2,$t1,$acc2
1817 srlx $acc0,32,$t1
1818 addxccc $acc3,$t2,$acc3
1819 addxc %g0,$t3,$acc4
1820 mov 0,$acc5
1821___
1822for($i=1;$i<4;$i++) {
1823 # Reduction iteration is normally performed by accumulating
1824 # result of multiplication of modulus by "magic" digit [and
1825 # omitting least significant word, which is guaranteed to
1826 # be 0], but thanks to special form of modulus and "magic"
1827 # digit being equal to least significant word, it can be
1828 # performed with additions and subtractions alone. Indeed:
1829 #
1830 # ffff0001.00000000.0000ffff.ffffffff
1831 # * abcdefgh
1832 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1833 #
1834 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1835 # rewrite above as:
1836 #
1837 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1838 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1839 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1840 #
1841 # or marking redundant operations:
1842 #
1843 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1844 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1845 # - 0000abcd.efgh0000.--------.--------.--------
1846 # ^^^^^^^^ but this word is calculated with umulxhi, because
1847 # there is no subtract with 64-bit borrow:-(
1848
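	# In 64-bit limbs the same identity reads: with d = acc[0] and
	# p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
	#
	#	d*p = d*0xFFFFFFFF00000001*2^192 + d*2^96 - d
	#
	# so the code adds d<<96 (the sllx/srlx pair split over two limbs),
	# adds the low and high halves of d*0xFFFFFFFF00000001 at limbs 3
	# and 4, and relies on the implicit "-d" to cancel the limb that is
	# being retired.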
1849$code.=<<___;
1850 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1851 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1852 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1853 mulx $a0,$bi,$t0
1854 addxccc $acc2,$t1,$acc1
1855 mulx $a1,$bi,$t1
1856 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1857 mulx $a2,$bi,$t2
1858 addxccc $acc4,$t3,$acc3
1859 mulx $a3,$bi,$t3
1860 addxc $acc5,%g0,$acc4
1861
1862 addcc $acc0,$t0,$acc0 ! accumulate low parts of multiplication
1863 umulxhi $a0,$bi,$t0
1864 addxccc $acc1,$t1,$acc1
1865 umulxhi $a1,$bi,$t1
1866 addxccc $acc2,$t2,$acc2
1867 umulxhi $a2,$bi,$t2
1868 addxccc $acc3,$t3,$acc3
1869 umulxhi $a3,$bi,$t3
1870 addxc $acc4,%g0,$acc4
1871___
1872$code.=<<___ if ($i<3);
1873 ldx [$bp+8*($i+1)],$bi ! bp[$i+1]
1874___
1875$code.=<<___;
1876	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
1877 sllx $acc0,32,$t0
1878 addxccc $acc2,$t1,$acc2
1879 srlx $acc0,32,$t1
1880 addxccc $acc3,$t2,$acc3
1881 addxccc $acc4,$t3,$acc4
1882 addxc %g0,%g0,$acc5
1883___
1884}
1885$code.=<<___;
1886 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1887 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1888 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1889 addxccc $acc2,$t1,$acc1
1890 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1891 addxccc $acc4,$t3,$acc3
1892 b .Lmul_final_vis3 ! see below
1893 addxc $acc5,%g0,$acc4
1894.type	__ecp_nistz256_mul_mont_vis3,#function
1895.size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1896
1897! compared to __ecp_nistz256_mul_mont_vis3 above it's 21% fewer
1898! instructions, but only 14% faster [on T4]...
1899.align 32
1900__ecp_nistz256_sqr_mont_vis3:
1901 ! | | | | | |a1*a0| |
1902 ! | | | | |a2*a0| | |
1903 ! | |a3*a2|a3*a0| | | |
1904 ! | | | |a2*a1| | | |
1905 ! | | |a3*a1| | | | |
1906 ! *| | | | | | | | 2|
1907 ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1908 ! |--+--+--+--+--+--+--+--|
1909 ! |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1910 !
1911	! "can't overflow" below marks carrying into the high part of a
1912	! multiplication result, which can't overflow, because that part
1913	! can never be all ones.
1914
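	! The diagram is just the schoolbook expansion, spelled out for
	! reference:
	!
	!   (a3*2^192 + a2*2^128 + a1*2^64 + a0)^2 =
	!	sum_i ai^2 * 2^(128*i)  +  2 * sum_{i<j} ai*aj * 2^(64*(i+j))
	!
	! so the cross products (acc1..acc6) are formed once, doubled, and
	! the squares ai^2 are then added on top of the doubled columns.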
1915 mulx $a1,$a0,$acc1 ! a[1]*a[0]
1916 umulxhi $a1,$a0,$t1
1917 mulx $a2,$a0,$acc2 ! a[2]*a[0]
1918 umulxhi $a2,$a0,$t2
1919 mulx $a3,$a0,$acc3 ! a[3]*a[0]
1920 umulxhi $a3,$a0,$acc4
1921
1922 addcc $acc2,$t1,$acc2 ! accumulate high parts of multiplication
1923 mulx $a2,$a1,$t0 ! a[2]*a[1]
1924 umulxhi $a2,$a1,$t1
1925 addxccc $acc3,$t2,$acc3
1926 mulx $a3,$a1,$t2 ! a[3]*a[1]
1927 umulxhi $a3,$a1,$t3
1928 addxc $acc4,%g0,$acc4 ! can't overflow
1929
1930 mulx $a3,$a2,$acc5 ! a[3]*a[2]
1931 not $poly3,$poly3 ! 0xFFFFFFFF00000001
1932 umulxhi $a3,$a2,$acc6
1933
1934 addcc $t2,$t1,$t1 ! accumulate high parts of multiplication
1935 mulx $a0,$a0,$acc0 ! a[0]*a[0]
1936 addxc $t3,%g0,$t2 ! can't overflow
1937
1938 addcc $acc3,$t0,$acc3 ! accumulate low parts of multiplication
1939 umulxhi $a0,$a0,$a0
1940 addxccc $acc4,$t1,$acc4
1941 mulx $a1,$a1,$t1 ! a[1]*a[1]
1942 addxccc $acc5,$t2,$acc5
1943 umulxhi $a1,$a1,$a1
1944 addxc $acc6,%g0,$acc6 ! can't overflow
1945
1946 addcc $acc1,$acc1,$acc1 ! acc[1-6]*=2
1947 mulx $a2,$a2,$t2 ! a[2]*a[2]
1948 addxccc $acc2,$acc2,$acc2
1949 umulxhi $a2,$a2,$a2
1950 addxccc $acc3,$acc3,$acc3
1951 mulx $a3,$a3,$t3 ! a[3]*a[3]
1952 addxccc $acc4,$acc4,$acc4
1953 umulxhi $a3,$a3,$a3
1954 addxccc $acc5,$acc5,$acc5
1955 addxccc $acc6,$acc6,$acc6
1956 addxc %g0,%g0,$acc7
1957
1958 addcc $acc1,$a0,$acc1 ! +a[i]*a[i]
1959 addxccc $acc2,$t1,$acc2
1960 addxccc $acc3,$a1,$acc3
1961 addxccc $acc4,$t2,$acc4
1962 sllx $acc0,32,$t0
1963 addxccc $acc5,$a2,$acc5
1964 srlx $acc0,32,$t1
1965 addxccc $acc6,$t3,$acc6
1966 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
1967 addxc $acc7,$a3,$acc7
1968___
1969for($i=0;$i<3;$i++) { # reductions, see commentary
1970 # in multiplication for details
1971$code.=<<___;
1972 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1973 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1974 sllx $acc0,32,$t0
1975 addxccc $acc2,$t1,$acc1
1976 srlx $acc0,32,$t1
1977 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1978 sub $acc0,$t0,$t2 ! acc0*0xFFFFFFFF00000001, low part
46f4e1be 1979 addxc %g0,$t3,$acc3 ! can't overflow
1980___
1981}
1982$code.=<<___;
1983 umulxhi $acc0,$poly3,$t3 ! acc0*0xFFFFFFFF00000001, high part
1984 addcc $acc1,$t0,$acc0 ! +=acc[0]<<96 and omit acc[0]
1985 addxccc $acc2,$t1,$acc1
1986 addxccc $acc3,$t2,$acc2 ! +=acc[0]*0xFFFFFFFF00000001
1987 addxc %g0,$t3,$acc3 ! can't overflow
1988
1989 addcc $acc0,$acc4,$acc0 ! accumulate upper half
1990 addxccc $acc1,$acc5,$acc1
1991 addxccc $acc2,$acc6,$acc2
1992 addxccc $acc3,$acc7,$acc3
1993 addxc %g0,%g0,$acc4
1994
1995.Lmul_final_vis3:
1996
1997 ! Final step is "if result > mod, subtract mod", but since comparison
1998 ! implies subtraction, we do the subtraction and then copy the outcome
1999 ! only if it didn't borrow. Note that because we [have to] replace
2000 ! subtraction with addition of the negated modulus, the carry/borrow
2001 ! logic is inverted.
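	! (-modulus mod 2^256, limb by limb from least significant:
	!  0x0000000000000001, 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF,
	!  0x00000000FFFFFFFE, hence the constants 1, $poly1, $minus1
	!  and $poly3 added below.)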
2002
2003 addcc $acc0,1,$t0 ! add -modulus, i.e. subtract
2004 not $poly3,$poly3 ! restore 0x00000000FFFFFFFE
2005 addxccc $acc1,$poly1,$t1
2006 addxccc $acc2,$minus1,$t2
2007 addxccc $acc3,$poly3,$t3
2008 addxccc $acc4,$minus1,%g0 ! did it carry?
2009
2010 movcs %xcc,$t0,$acc0
2011 movcs %xcc,$t1,$acc1
2012 stx $acc0,[$rp]
2013 movcs %xcc,$t2,$acc2
2014 stx $acc1,[$rp+8]
2015 movcs %xcc,$t3,$acc3
2016 stx $acc2,[$rp+16]
2017 retl
2018 stx $acc3,[$rp+24]
ff823ee8 2019.type __ecp_nistz256_sqr_mont_vis3,#function
2020.size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
2021___
2022
2023########################################################################
2024# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2025#
2026{
2027my ($res_x,$res_y,$res_z,
2028 $in_x,$in_y,$in_z,
2029 $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2030# above map() describes stack layout with 10 temporary
2031# 256-bit vectors on top.
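# As a reading aid (derived from the p256_* comments on the calls below,
# not part of the original): the sequence implements the usual Jacobian
# doubling formulas
#	S = 4*X*Y^2,	M = 3*(X+Z^2)*(X-Z^2),
#	X' = M^2 - 2*S,	Y' = M*(S - X') - 8*Y^4,	Z' = 2*Y*Z.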
2032
2033$code.=<<___;
2034.align 32
2035ecp_nistz256_point_double_vis3:
2036 save %sp,-STACK64_FRAME-32*10,%sp
2037
2038 mov $rp,$rp_real
1a661908 2039.Ldouble_shortcut_vis3:
2040 mov -1,$minus1
2041 mov -2,$poly3
2042 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
2043 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
2044
2045 ! convert input to uint64_t[4]
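	! (each 64-bit limb below is assembled as a[i] = in[2*i] | in[2*i+1]<<32)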
2046 ld [$ap],$a0 ! in_x
2047 ld [$ap+4],$t0
2048 ld [$ap+8],$a1
2049 ld [$ap+12],$t1
2050 ld [$ap+16],$a2
2051 ld [$ap+20],$t2
2052 ld [$ap+24],$a3
2053 ld [$ap+28],$t3
2054 sllx $t0,32,$t0
2055 sllx $t1,32,$t1
2056 ld [$ap+32],$acc0 ! in_y
2057 or $a0,$t0,$a0
2058 ld [$ap+32+4],$t0
2059 sllx $t2,32,$t2
2060 ld [$ap+32+8],$acc1
2061 or $a1,$t1,$a1
2062 ld [$ap+32+12],$t1
2063 sllx $t3,32,$t3
2064 ld [$ap+32+16],$acc2
2065 or $a2,$t2,$a2
2066 ld [$ap+32+20],$t2
2067 or $a3,$t3,$a3
2068 ld [$ap+32+24],$acc3
2069 sllx $t0,32,$t0
2070 ld [$ap+32+28],$t3
2071 sllx $t1,32,$t1
2072 stx $a0,[%sp+LOCALS64+$in_x]
2073 sllx $t2,32,$t2
2074 stx $a1,[%sp+LOCALS64+$in_x+8]
2075 sllx $t3,32,$t3
2076 stx $a2,[%sp+LOCALS64+$in_x+16]
2077 or $acc0,$t0,$acc0
2078 stx $a3,[%sp+LOCALS64+$in_x+24]
2079 or $acc1,$t1,$acc1
2080 stx $acc0,[%sp+LOCALS64+$in_y]
2081 or $acc2,$t2,$acc2
2082 stx $acc1,[%sp+LOCALS64+$in_y+8]
2083 or $acc3,$t3,$acc3
2084 stx $acc2,[%sp+LOCALS64+$in_y+16]
2085 stx $acc3,[%sp+LOCALS64+$in_y+24]
2086
2087 ld [$ap+64],$a0 ! in_z
2088 ld [$ap+64+4],$t0
2089 ld [$ap+64+8],$a1
2090 ld [$ap+64+12],$t1
2091 ld [$ap+64+16],$a2
2092 ld [$ap+64+20],$t2
2093 ld [$ap+64+24],$a3
2094 ld [$ap+64+28],$t3
2095 sllx $t0,32,$t0
2096 sllx $t1,32,$t1
2097 or $a0,$t0,$a0
2098 sllx $t2,32,$t2
2099 or $a1,$t1,$a1
2100 sllx $t3,32,$t3
2101 or $a2,$t2,$a2
2102 or $a3,$t3,$a3
2103 sllx $t0,32,$t0
2104 sllx $t1,32,$t1
2105 stx $a0,[%sp+LOCALS64+$in_z]
2106 sllx $t2,32,$t2
2107 stx $a1,[%sp+LOCALS64+$in_z+8]
2108 sllx $t3,32,$t3
2109 stx $a2,[%sp+LOCALS64+$in_z+16]
2110 stx $a3,[%sp+LOCALS64+$in_z+24]
2111
2112 ! in_y is still in $acc0-$acc3
2113 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(S, in_y);
2114 add %sp,LOCALS64+$S,$rp
2115
2116 ! in_z is still in $a0-$a3
2117 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Zsqr, in_z);
2118 add %sp,LOCALS64+$Zsqr,$rp
2119
2120 mov $acc0,$a0 ! put Zsqr aside
2121 mov $acc1,$a1
2122 mov $acc2,$a2
2123 mov $acc3,$a3
2124
2125 add %sp,LOCALS64+$in_x,$bp
2126 call __ecp_nistz256_add_vis3 ! p256_add(M, Zsqr, in_x);
2127 add %sp,LOCALS64+$M,$rp
2128
2129 mov $a0,$acc0 ! restore Zsqr
2130 ldx [%sp+LOCALS64+$S],$a0 ! forward load
2131 mov $a1,$acc1
2132 ldx [%sp+LOCALS64+$S+8],$a1
2133 mov $a2,$acc2
2134 ldx [%sp+LOCALS64+$S+16],$a2
2135 mov $a3,$acc3
2136 ldx [%sp+LOCALS64+$S+24],$a3
2137
2138 add %sp,LOCALS64+$in_x,$bp
2139 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(Zsqr, in_x, Zsqr);
2140 add %sp,LOCALS64+$Zsqr,$rp
2141
2142 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(S, S);
2143 add %sp,LOCALS64+$S,$rp
2144
2145 ldx [%sp+LOCALS64+$in_z],$bi
2146 ldx [%sp+LOCALS64+$in_y],$a0
2147 ldx [%sp+LOCALS64+$in_y+8],$a1
2148 ldx [%sp+LOCALS64+$in_y+16],$a2
2149 ldx [%sp+LOCALS64+$in_y+24],$a3
2150 add %sp,LOCALS64+$in_z,$bp
2151 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(tmp0, in_z, in_y);
2152 add %sp,LOCALS64+$tmp0,$rp
2153
2154 ldx [%sp+LOCALS64+$M],$bi ! forward load
2155 ldx [%sp+LOCALS64+$Zsqr],$a0
2156 ldx [%sp+LOCALS64+$Zsqr+8],$a1
2157 ldx [%sp+LOCALS64+$Zsqr+16],$a2
2158 ldx [%sp+LOCALS64+$Zsqr+24],$a3
2159
2160 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(res_z, tmp0);
2161 add %sp,LOCALS64+$res_z,$rp
2162
2163 add %sp,LOCALS64+$M,$bp
2164 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(M, M, Zsqr);
2165 add %sp,LOCALS64+$M,$rp
2166
2167 mov $acc0,$a0 ! put aside M
2168 mov $acc1,$a1
2169 mov $acc2,$a2
2170 mov $acc3,$a3
2171 call __ecp_nistz256_mul_by_2_vis3
2172 add %sp,LOCALS64+$M,$rp
2173 mov $a0,$t0 ! copy M
2174 ldx [%sp+LOCALS64+$S],$a0 ! forward load
2175 mov $a1,$t1
2176 ldx [%sp+LOCALS64+$S+8],$a1
2177 mov $a2,$t2
2178 ldx [%sp+LOCALS64+$S+16],$a2
2179 mov $a3,$t3
2180 ldx [%sp+LOCALS64+$S+24],$a3
2181 call __ecp_nistz256_add_noload_vis3 ! p256_mul_by_3(M, M);
2182 add %sp,LOCALS64+$M,$rp
2183
2184 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(tmp0, S);
2185 add %sp,LOCALS64+$tmp0,$rp
2186
2187 ldx [%sp+LOCALS64+$S],$bi ! forward load
2188 ldx [%sp+LOCALS64+$in_x],$a0
2189 ldx [%sp+LOCALS64+$in_x+8],$a1
2190 ldx [%sp+LOCALS64+$in_x+16],$a2
2191 ldx [%sp+LOCALS64+$in_x+24],$a3
2192
2193 call __ecp_nistz256_div_by_2_vis3 ! p256_div_by_2(res_y, tmp0);
2194 add %sp,LOCALS64+$res_y,$rp
2195
2196 add %sp,LOCALS64+$S,$bp
2197 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, in_x);
2198 add %sp,LOCALS64+$S,$rp
2199
2200 ldx [%sp+LOCALS64+$M],$a0 ! forward load
2201 ldx [%sp+LOCALS64+$M+8],$a1
2202 ldx [%sp+LOCALS64+$M+16],$a2
2203 ldx [%sp+LOCALS64+$M+24],$a3
2204
2205 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(tmp0, S);
2206 add %sp,LOCALS64+$tmp0,$rp
2207
2208 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(res_x, M);
2209 add %sp,LOCALS64+$res_x,$rp
2210
2211 add %sp,LOCALS64+$tmp0,$bp
2212 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, tmp0);
2213 add %sp,LOCALS64+$res_x,$rp
2214
2215 ldx [%sp+LOCALS64+$M],$a0 ! forward load
2216 ldx [%sp+LOCALS64+$M+8],$a1
2217 ldx [%sp+LOCALS64+$M+16],$a2
2218 ldx [%sp+LOCALS64+$M+24],$a3
2219
2220 add %sp,LOCALS64+$S,$bp
2221 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(S, S, res_x);
2222 add %sp,LOCALS64+$S,$rp
2223
2224 mov $acc0,$bi
2225 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S, S, M);
2226 add %sp,LOCALS64+$S,$rp
2227
2228 ldx [%sp+LOCALS64+$res_x],$a0 ! forward load
2229 ldx [%sp+LOCALS64+$res_x+8],$a1
2230 ldx [%sp+LOCALS64+$res_x+16],$a2
2231 ldx [%sp+LOCALS64+$res_x+24],$a3
2232
2233 add %sp,LOCALS64+$res_y,$bp
2234 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, S, res_y);
2235 add %sp,LOCALS64+$res_y,$rp
2236
2237 ! convert output to uint32_t[8]
2238 srlx $a0,32,$t0
2239 srlx $a1,32,$t1
2240 st $a0,[$rp_real] ! res_x
2241 srlx $a2,32,$t2
2242 st $t0,[$rp_real+4]
2243 srlx $a3,32,$t3
2244 st $a1,[$rp_real+8]
2245 st $t1,[$rp_real+12]
2246 st $a2,[$rp_real+16]
2247 st $t2,[$rp_real+20]
2248 st $a3,[$rp_real+24]
2249 st $t3,[$rp_real+28]
2250
2251 ldx [%sp+LOCALS64+$res_z],$a0 ! forward load
2252 srlx $acc0,32,$t0
2253 ldx [%sp+LOCALS64+$res_z+8],$a1
2254 srlx $acc1,32,$t1
2255 ldx [%sp+LOCALS64+$res_z+16],$a2
2256 srlx $acc2,32,$t2
2257 ldx [%sp+LOCALS64+$res_z+24],$a3
2258 srlx $acc3,32,$t3
2259 st $acc0,[$rp_real+32] ! res_y
2260 st $t0, [$rp_real+32+4]
2261 st $acc1,[$rp_real+32+8]
2262 st $t1, [$rp_real+32+12]
2263 st $acc2,[$rp_real+32+16]
2264 st $t2, [$rp_real+32+20]
2265 st $acc3,[$rp_real+32+24]
2266 st $t3, [$rp_real+32+28]
2267
2268 srlx $a0,32,$t0
2269 srlx $a1,32,$t1
2270 st $a0,[$rp_real+64] ! res_z
2271 srlx $a2,32,$t2
2272 st $t0,[$rp_real+64+4]
2273 srlx $a3,32,$t3
2274 st $a1,[$rp_real+64+8]
2275 st $t1,[$rp_real+64+12]
2276 st $a2,[$rp_real+64+16]
2277 st $t2,[$rp_real+64+20]
2278 st $a3,[$rp_real+64+24]
2279 st $t3,[$rp_real+64+28]
2280
2281 ret
2282 restore
ff823ee8 2283.type ecp_nistz256_point_double_vis3,#function
2284.size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2285___
2286}
2287########################################################################
2288# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2289# const P256_POINT *in2);
2290{
2291my ($res_x,$res_y,$res_z,
2292 $in1_x,$in1_y,$in1_z,
2293 $in2_x,$in2_y,$in2_z,
2294 $H,$Hsqr,$R,$Rsqr,$Hcub,
2295 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2296my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2297
2298# above map() describes stack layout with 18 temporary
2299# 256-bit vectors on top. Then we reserve some space for
2300# !in1infty, !in2infty and result of check for zero.
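# As a reading aid (derived from the p256_* comments on the calls below,
# not part of the original): with U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3,
# S2 = Y2*Z1^3, H = U2-U1 and R = S2-S1, the routine computes
#	X3 = R^2 - H^3 - 2*U1*H^2,
#	Y3 = R*(U1*H^2 - X3) - S1*H^3,
#	Z3 = H*Z1*Z2,
# falling back to the doubling code when the inputs are equal and storing an
# all-zero point when one input is the negation of the other.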
2301
2302$code.=<<___;
2303.align 32
2304ecp_nistz256_point_add_vis3:
2305 save %sp,-STACK64_FRAME-32*18-32,%sp
2306
2307 mov $rp,$rp_real
2308 mov -1,$minus1
2309 mov -2,$poly3
2310 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
2311 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
2312
2313 ! convert input to uint64_t[4]
2314 ld [$bp],$a0 ! in2_x
2315 ld [$bp+4],$t0
2316 ld [$bp+8],$a1
2317 ld [$bp+12],$t1
2318 ld [$bp+16],$a2
2319 ld [$bp+20],$t2
2320 ld [$bp+24],$a3
2321 ld [$bp+28],$t3
2322 sllx $t0,32,$t0
2323 sllx $t1,32,$t1
2324 ld [$bp+32],$acc0 ! in2_y
2325 or $a0,$t0,$a0
2326 ld [$bp+32+4],$t0
2327 sllx $t2,32,$t2
2328 ld [$bp+32+8],$acc1
2329 or $a1,$t1,$a1
2330 ld [$bp+32+12],$t1
2331 sllx $t3,32,$t3
2332 ld [$bp+32+16],$acc2
2333 or $a2,$t2,$a2
2334 ld [$bp+32+20],$t2
2335 or $a3,$t3,$a3
2336 ld [$bp+32+24],$acc3
2337 sllx $t0,32,$t0
2338 ld [$bp+32+28],$t3
2339 sllx $t1,32,$t1
2340 stx $a0,[%sp+LOCALS64+$in2_x]
2341 sllx $t2,32,$t2
2342 stx $a1,[%sp+LOCALS64+$in2_x+8]
2343 sllx $t3,32,$t3
2344 stx $a2,[%sp+LOCALS64+$in2_x+16]
2345 or $acc0,$t0,$acc0
2346 stx $a3,[%sp+LOCALS64+$in2_x+24]
2347 or $acc1,$t1,$acc1
2348 stx $acc0,[%sp+LOCALS64+$in2_y]
2349 or $acc2,$t2,$acc2
2350 stx $acc1,[%sp+LOCALS64+$in2_y+8]
2351 or $acc3,$t3,$acc3
2352 stx $acc2,[%sp+LOCALS64+$in2_y+16]
2353 stx $acc3,[%sp+LOCALS64+$in2_y+24]
2354
2355 ld [$bp+64],$acc0 ! in2_z
2356 ld [$bp+64+4],$t0
2357 ld [$bp+64+8],$acc1
2358 ld [$bp+64+12],$t1
2359 ld [$bp+64+16],$acc2
2360 ld [$bp+64+20],$t2
2361 ld [$bp+64+24],$acc3
2362 ld [$bp+64+28],$t3
2363 sllx $t0,32,$t0
2364 sllx $t1,32,$t1
2365 ld [$ap],$a0 ! in1_x
2366 or $acc0,$t0,$acc0
2367 ld [$ap+4],$t0
2368 sllx $t2,32,$t2
2369 ld [$ap+8],$a1
2370 or $acc1,$t1,$acc1
2371 ld [$ap+12],$t1
2372 sllx $t3,32,$t3
2373 ld [$ap+16],$a2
2374 or $acc2,$t2,$acc2
2375 ld [$ap+20],$t2
2376 or $acc3,$t3,$acc3
2377 ld [$ap+24],$a3
2378 sllx $t0,32,$t0
2379 ld [$ap+28],$t3
2380 sllx $t1,32,$t1
2381 stx $acc0,[%sp+LOCALS64+$in2_z]
2382 sllx $t2,32,$t2
2383 stx $acc1,[%sp+LOCALS64+$in2_z+8]
2384 sllx $t3,32,$t3
2385 stx $acc2,[%sp+LOCALS64+$in2_z+16]
2386 stx $acc3,[%sp+LOCALS64+$in2_z+24]
2387
2388 or $acc1,$acc0,$acc0
2389 or $acc3,$acc2,$acc2
2390 or $acc2,$acc0,$acc0
2391 movrnz $acc0,-1,$acc0 ! !in2infty
2392 stx $acc0,[%fp+STACK_BIAS-8]
2393
2394 or $a0,$t0,$a0
2395 ld [$ap+32],$acc0 ! in1_y
2396 or $a1,$t1,$a1
2397 ld [$ap+32+4],$t0
2398 or $a2,$t2,$a2
2399 ld [$ap+32+8],$acc1
2400 or $a3,$t3,$a3
2401 ld [$ap+32+12],$t1
2402 ld [$ap+32+16],$acc2
2403 ld [$ap+32+20],$t2
2404 ld [$ap+32+24],$acc3
2405 sllx $t0,32,$t0
2406 ld [$ap+32+28],$t3
2407 sllx $t1,32,$t1
2408 stx $a0,[%sp+LOCALS64+$in1_x]
2409 sllx $t2,32,$t2
2410 stx $a1,[%sp+LOCALS64+$in1_x+8]
2411 sllx $t3,32,$t3
2412 stx $a2,[%sp+LOCALS64+$in1_x+16]
2413 or $acc0,$t0,$acc0
2414 stx $a3,[%sp+LOCALS64+$in1_x+24]
2415 or $acc1,$t1,$acc1
2416 stx $acc0,[%sp+LOCALS64+$in1_y]
2417 or $acc2,$t2,$acc2
2418 stx $acc1,[%sp+LOCALS64+$in1_y+8]
2419 or $acc3,$t3,$acc3
2420 stx $acc2,[%sp+LOCALS64+$in1_y+16]
2421 stx $acc3,[%sp+LOCALS64+$in1_y+24]
2422
2423 ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load
2424 ldx [%sp+LOCALS64+$in2_z+8],$a1
2425 ldx [%sp+LOCALS64+$in2_z+16],$a2
2426 ldx [%sp+LOCALS64+$in2_z+24],$a3
2427
2428 ld [$ap+64],$acc0 ! in1_z
2429 ld [$ap+64+4],$t0
2430 ld [$ap+64+8],$acc1
2431 ld [$ap+64+12],$t1
2432 ld [$ap+64+16],$acc2
2433 ld [$ap+64+20],$t2
2434 ld [$ap+64+24],$acc3
2435 ld [$ap+64+28],$t3
2436 sllx $t0,32,$t0
2437 sllx $t1,32,$t1
2438 or $acc0,$t0,$acc0
2439 sllx $t2,32,$t2
2440 or $acc1,$t1,$acc1
2441 sllx $t3,32,$t3
2442 stx $acc0,[%sp+LOCALS64+$in1_z]
2443 or $acc2,$t2,$acc2
2444 stx $acc1,[%sp+LOCALS64+$in1_z+8]
2445 or $acc3,$t3,$acc3
2446 stx $acc2,[%sp+LOCALS64+$in1_z+16]
2447 stx $acc3,[%sp+LOCALS64+$in1_z+24]
2448
2449 or $acc1,$acc0,$acc0
2450 or $acc3,$acc2,$acc2
2451 or $acc2,$acc0,$acc0
2452 movrnz $acc0,-1,$acc0 ! !in1infty
2453 stx $acc0,[%fp+STACK_BIAS-16]
2454
2455 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z);
2456 add %sp,LOCALS64+$Z2sqr,$rp
2457
2458 ldx [%sp+LOCALS64+$in1_z],$a0
2459 ldx [%sp+LOCALS64+$in1_z+8],$a1
2460 ldx [%sp+LOCALS64+$in1_z+16],$a2
2461 ldx [%sp+LOCALS64+$in1_z+24],$a3
2462 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z);
2463 add %sp,LOCALS64+$Z1sqr,$rp
2464
2465 ldx [%sp+LOCALS64+$Z2sqr],$bi
2466 ldx [%sp+LOCALS64+$in2_z],$a0
2467 ldx [%sp+LOCALS64+$in2_z+8],$a1
2468 ldx [%sp+LOCALS64+$in2_z+16],$a2
2469 ldx [%sp+LOCALS64+$in2_z+24],$a3
2470 add %sp,LOCALS64+$Z2sqr,$bp
2471 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, Z2sqr, in2_z);
2472 add %sp,LOCALS64+$S1,$rp
2473
2474 ldx [%sp+LOCALS64+$Z1sqr],$bi
2475 ldx [%sp+LOCALS64+$in1_z],$a0
2476 ldx [%sp+LOCALS64+$in1_z+8],$a1
2477 ldx [%sp+LOCALS64+$in1_z+16],$a2
2478 ldx [%sp+LOCALS64+$in1_z+24],$a3
2479 add %sp,LOCALS64+$Z1sqr,$bp
2480 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z);
2481 add %sp,LOCALS64+$S2,$rp
2482
2483 ldx [%sp+LOCALS64+$S1],$bi
2484 ldx [%sp+LOCALS64+$in1_y],$a0
2485 ldx [%sp+LOCALS64+$in1_y+8],$a1
2486 ldx [%sp+LOCALS64+$in1_y+16],$a2
2487 ldx [%sp+LOCALS64+$in1_y+24],$a3
2488 add %sp,LOCALS64+$S1,$bp
2489 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S1, S1, in1_y);
2490 add %sp,LOCALS64+$S1,$rp
2491
2492 ldx [%sp+LOCALS64+$S2],$bi
2493 ldx [%sp+LOCALS64+$in2_y],$a0
2494 ldx [%sp+LOCALS64+$in2_y+8],$a1
2495 ldx [%sp+LOCALS64+$in2_y+16],$a2
2496 ldx [%sp+LOCALS64+$in2_y+24],$a3
2497 add %sp,LOCALS64+$S2,$bp
2498 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y);
2499 add %sp,LOCALS64+$S2,$rp
2500
2501 ldx [%sp+LOCALS64+$Z2sqr],$bi ! forward load
2502 ldx [%sp+LOCALS64+$in1_x],$a0
2503 ldx [%sp+LOCALS64+$in1_x+8],$a1
2504 ldx [%sp+LOCALS64+$in1_x+16],$a2
2505 ldx [%sp+LOCALS64+$in1_x+24],$a3
2506
2507 add %sp,LOCALS64+$S1,$bp
2508 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, S1);
2509 add %sp,LOCALS64+$R,$rp
2510
2511 or $acc1,$acc0,$acc0 ! see if result is zero
2512 or $acc3,$acc2,$acc2
2513 or $acc2,$acc0,$acc0
2514 stx $acc0,[%fp+STACK_BIAS-24]
2515
2516 add %sp,LOCALS64+$Z2sqr,$bp
2517 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U1, in1_x, Z2sqr);
2518 add %sp,LOCALS64+$U1,$rp
2519
2520 ldx [%sp+LOCALS64+$Z1sqr],$bi
2521 ldx [%sp+LOCALS64+$in2_x],$a0
2522 ldx [%sp+LOCALS64+$in2_x+8],$a1
2523 ldx [%sp+LOCALS64+$in2_x+16],$a2
2524 ldx [%sp+LOCALS64+$in2_x+24],$a3
2525 add %sp,LOCALS64+$Z1sqr,$bp
2526 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in2_x, Z1sqr);
2527 add %sp,LOCALS64+$U2,$rp
2528
2529 ldx [%sp+LOCALS64+$R],$a0 ! forward load
2530 ldx [%sp+LOCALS64+$R+8],$a1
2531 ldx [%sp+LOCALS64+$R+16],$a2
2532 ldx [%sp+LOCALS64+$R+24],$a3
2533
2534 add %sp,LOCALS64+$U1,$bp
2535 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, U1);
2536 add %sp,LOCALS64+$H,$rp
2537
2538 or $acc1,$acc0,$acc0 ! see if result is zero
2539 or $acc3,$acc2,$acc2
2540 orcc $acc2,$acc0,$acc0
2541
2542 bne,pt %xcc,.Ladd_proceed_vis3 ! is_equal(U1,U2)?
2543 nop
2544
2545 ldx [%fp+STACK_BIAS-8],$t0
2546 ldx [%fp+STACK_BIAS-16],$t1
2547 ldx [%fp+STACK_BIAS-24],$t2
2548 andcc $t0,$t1,%g0
2549 be,pt %xcc,.Ladd_proceed_vis3 ! (in1infty || in2infty)?
2550 nop
2551 andcc $t2,$t2,%g0
2552 be,a,pt %xcc,.Ldouble_shortcut_vis3 ! is_equal(S1,S2)?
2553 add %sp,32*(12-10)+32,%sp ! difference in frame sizes
2554
2555 st %g0,[$rp_real]
2556 st %g0,[$rp_real+4]
2557 st %g0,[$rp_real+8]
2558 st %g0,[$rp_real+12]
2559 st %g0,[$rp_real+16]
2560 st %g0,[$rp_real+20]
2561 st %g0,[$rp_real+24]
2562 st %g0,[$rp_real+28]
2563 st %g0,[$rp_real+32]
2564 st %g0,[$rp_real+32+4]
2565 st %g0,[$rp_real+32+8]
2566 st %g0,[$rp_real+32+12]
2567 st %g0,[$rp_real+32+16]
2568 st %g0,[$rp_real+32+20]
2569 st %g0,[$rp_real+32+24]
2570 st %g0,[$rp_real+32+28]
2571 st %g0,[$rp_real+64]
2572 st %g0,[$rp_real+64+4]
2573 st %g0,[$rp_real+64+8]
2574 st %g0,[$rp_real+64+12]
2575 st %g0,[$rp_real+64+16]
2576 st %g0,[$rp_real+64+20]
2577 st %g0,[$rp_real+64+24]
2578 st %g0,[$rp_real+64+28]
2579 b .Ladd_done_vis3
2580 nop
2581
2582.align 16
2583.Ladd_proceed_vis3:
2584 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R);
2585 add %sp,LOCALS64+$Rsqr,$rp
2586
2587 ldx [%sp+LOCALS64+$H],$bi
2588 ldx [%sp+LOCALS64+$in1_z],$a0
2589 ldx [%sp+LOCALS64+$in1_z+8],$a1
2590 ldx [%sp+LOCALS64+$in1_z+16],$a2
2591 ldx [%sp+LOCALS64+$in1_z+24],$a3
2592 add %sp,LOCALS64+$H,$bp
2593 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z);
2594 add %sp,LOCALS64+$res_z,$rp
2595
2596 ldx [%sp+LOCALS64+$H],$a0
2597 ldx [%sp+LOCALS64+$H+8],$a1
2598 ldx [%sp+LOCALS64+$H+16],$a2
2599 ldx [%sp+LOCALS64+$H+24],$a3
2600 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H);
2601 add %sp,LOCALS64+$Hsqr,$rp
2602
2603 ldx [%sp+LOCALS64+$res_z],$bi
2604 ldx [%sp+LOCALS64+$in2_z],$a0
2605 ldx [%sp+LOCALS64+$in2_z+8],$a1
2606 ldx [%sp+LOCALS64+$in2_z+16],$a2
2607 ldx [%sp+LOCALS64+$in2_z+24],$a3
2608 add %sp,LOCALS64+$res_z,$bp
2609 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, res_z, in2_z);
2610 add %sp,LOCALS64+$res_z,$rp
2611
2612 ldx [%sp+LOCALS64+$H],$bi
2613 ldx [%sp+LOCALS64+$Hsqr],$a0
2614 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2615 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2616 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2617 add %sp,LOCALS64+$H,$bp
2618 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H);
2619 add %sp,LOCALS64+$Hcub,$rp
2620
2621 ldx [%sp+LOCALS64+$U1],$bi
2622 ldx [%sp+LOCALS64+$Hsqr],$a0
2623 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2624 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2625 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2626 add %sp,LOCALS64+$U1,$bp
2627 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, U1, Hsqr);
2628 add %sp,LOCALS64+$U2,$rp
2629
2630 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2);
2631 add %sp,LOCALS64+$Hsqr,$rp
2632
2633 add %sp,LOCALS64+$Rsqr,$bp
2634 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr);
2635 add %sp,LOCALS64+$res_x,$rp
2636
2637 add %sp,LOCALS64+$Hcub,$bp
2638 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub);
2639 add %sp,LOCALS64+$res_x,$rp
2640
2641 ldx [%sp+LOCALS64+$S1],$bi ! forward load
2642 ldx [%sp+LOCALS64+$Hcub],$a0
2643 ldx [%sp+LOCALS64+$Hcub+8],$a1
2644 ldx [%sp+LOCALS64+$Hcub+16],$a2
2645 ldx [%sp+LOCALS64+$Hcub+24],$a3
2646
2647 add %sp,LOCALS64+$U2,$bp
2648 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x);
2649 add %sp,LOCALS64+$res_y,$rp
2650
2651 add %sp,LOCALS64+$S1,$bp
2652 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S1, Hcub);
2653 add %sp,LOCALS64+$S2,$rp
2654
2655 ldx [%sp+LOCALS64+$R],$bi
2656 ldx [%sp+LOCALS64+$res_y],$a0
2657 ldx [%sp+LOCALS64+$res_y+8],$a1
2658 ldx [%sp+LOCALS64+$res_y+16],$a2
2659 ldx [%sp+LOCALS64+$res_y+24],$a3
2660 add %sp,LOCALS64+$R,$bp
2661 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R);
2662 add %sp,LOCALS64+$res_y,$rp
2663
2664 add %sp,LOCALS64+$S2,$bp
2665 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2);
2666 add %sp,LOCALS64+$res_y,$rp
2667
2668 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty
2669 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty
2670___
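# Each iteration below copies one 128-bit chunk of the result while handling
# the points at infinity; per chunk it effectively computes (a restatement
# of the movrz pairs, not new logic):
#	res = (in2 == infinity) ? in1 : ((in1 == infinity) ? in2 : res)
# before splitting the 64-bit limbs back into 32-bit output words.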
2671for($i=0;$i<96;$i+=16) { # conditional moves
2672$code.=<<___;
2673 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2674 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2675 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2
2676 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3
2677 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2678 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2679 movrz $t1,$acc2,$acc0
2680 movrz $t1,$acc3,$acc1
2681 movrz $t2,$acc4,$acc0
2682 movrz $t2,$acc5,$acc1
2683 srlx $acc0,32,$acc2
2684 srlx $acc1,32,$acc3
2685 st $acc0,[$rp_real+$i]
2686 st $acc2,[$rp_real+$i+4]
2687 st $acc1,[$rp_real+$i+8]
2688 st $acc3,[$rp_real+$i+12]
2689___
2690}
2691$code.=<<___;
2692.Ladd_done_vis3:
2693 ret
2694 restore
ff823ee8 2695.type ecp_nistz256_point_add_vis3,#function
2696.size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2697___
2698}
2699########################################################################
2700# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2701# const P256_POINT_AFFINE *in2);
2702{
2703my ($res_x,$res_y,$res_z,
2704 $in1_x,$in1_y,$in1_z,
2705 $in2_x,$in2_y,
2706 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2707my $Z1sqr = $S2;
2708# above map() describes stack layout with 15 temporary
2709# 256-bit vectors on top. Then we reserve some space for
2710# !in1infty and !in2infty.
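# As a reading aid (not part of the original): this is the same addition as
# above specialized to an affine second input, i.e. Z2 = 1, so the Z2^2 and
# Z2^3 factors drop out (U1 = X1, S1 = Y1, Z3 = H*Z1). The final
# conditional-move loop substitutes .Lone_mont_vis3 (1 in Montgomery form)
# for the missing in2_z when the first input is the point at infinity.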
2711
2712$code.=<<___;
2713.align 32
2714ecp_nistz256_point_add_affine_vis3:
2715 save %sp,-STACK64_FRAME-32*15-32,%sp
2716
2717 mov $rp,$rp_real
2718 mov -1,$minus1
2719 mov -2,$poly3
2720 sllx $minus1,32,$poly1 ! 0xFFFFFFFF00000000
2721 srl $poly3,0,$poly3 ! 0x00000000FFFFFFFE
2722
2723 ! convert input to uint64_t[4]
2724 ld [$bp],$a0 ! in2_x
2725 ld [$bp+4],$t0
2726 ld [$bp+8],$a1
2727 ld [$bp+12],$t1
2728 ld [$bp+16],$a2
2729 ld [$bp+20],$t2
2730 ld [$bp+24],$a3
2731 ld [$bp+28],$t3
2732 sllx $t0,32,$t0
2733 sllx $t1,32,$t1
2734 ld [$bp+32],$acc0 ! in2_y
2735 or $a0,$t0,$a0
2736 ld [$bp+32+4],$t0
2737 sllx $t2,32,$t2
2738 ld [$bp+32+8],$acc1
2739 or $a1,$t1,$a1
2740 ld [$bp+32+12],$t1
2741 sllx $t3,32,$t3
2742 ld [$bp+32+16],$acc2
2743 or $a2,$t2,$a2
2744 ld [$bp+32+20],$t2
2745 or $a3,$t3,$a3
2746 ld [$bp+32+24],$acc3
2747 sllx $t0,32,$t0
2748 ld [$bp+32+28],$t3
2749 sllx $t1,32,$t1
2750 stx $a0,[%sp+LOCALS64+$in2_x]
2751 sllx $t2,32,$t2
2752 stx $a1,[%sp+LOCALS64+$in2_x+8]
2753 sllx $t3,32,$t3
2754 stx $a2,[%sp+LOCALS64+$in2_x+16]
2755 or $acc0,$t0,$acc0
2756 stx $a3,[%sp+LOCALS64+$in2_x+24]
2757 or $acc1,$t1,$acc1
2758 stx $acc0,[%sp+LOCALS64+$in2_y]
2759 or $acc2,$t2,$acc2
2760 stx $acc1,[%sp+LOCALS64+$in2_y+8]
2761 or $acc3,$t3,$acc3
2762 stx $acc2,[%sp+LOCALS64+$in2_y+16]
2763 stx $acc3,[%sp+LOCALS64+$in2_y+24]
2764
2765 or $a1,$a0,$a0
2766 or $a3,$a2,$a2
2767 or $acc1,$acc0,$acc0
2768 or $acc3,$acc2,$acc2
2769 or $a2,$a0,$a0
2770 or $acc2,$acc0,$acc0
2771 or $acc0,$a0,$a0
2772 movrnz $a0,-1,$a0 ! !in2infty
2773 stx $a0,[%fp+STACK_BIAS-8]
2774
2775 ld [$ap],$a0 ! in1_x
2776 ld [$ap+4],$t0
2777 ld [$ap+8],$a1
2778 ld [$ap+12],$t1
2779 ld [$ap+16],$a2
2780 ld [$ap+20],$t2
2781 ld [$ap+24],$a3
2782 ld [$ap+28],$t3
2783 sllx $t0,32,$t0
2784 sllx $t1,32,$t1
2785 ld [$ap+32],$acc0 ! in1_y
2786 or $a0,$t0,$a0
2787 ld [$ap+32+4],$t0
2788 sllx $t2,32,$t2
2789 ld [$ap+32+8],$acc1
2790 or $a1,$t1,$a1
2791 ld [$ap+32+12],$t1
2792 sllx $t3,32,$t3
2793 ld [$ap+32+16],$acc2
2794 or $a2,$t2,$a2
2795 ld [$ap+32+20],$t2
2796 or $a3,$t3,$a3
2797 ld [$ap+32+24],$acc3
2798 sllx $t0,32,$t0
2799 ld [$ap+32+28],$t3
2800 sllx $t1,32,$t1
2801 stx $a0,[%sp+LOCALS64+$in1_x]
2802 sllx $t2,32,$t2
2803 stx $a1,[%sp+LOCALS64+$in1_x+8]
2804 sllx $t3,32,$t3
2805 stx $a2,[%sp+LOCALS64+$in1_x+16]
2806 or $acc0,$t0,$acc0
2807 stx $a3,[%sp+LOCALS64+$in1_x+24]
2808 or $acc1,$t1,$acc1
2809 stx $acc0,[%sp+LOCALS64+$in1_y]
2810 or $acc2,$t2,$acc2
2811 stx $acc1,[%sp+LOCALS64+$in1_y+8]
2812 or $acc3,$t3,$acc3
2813 stx $acc2,[%sp+LOCALS64+$in1_y+16]
2814 stx $acc3,[%sp+LOCALS64+$in1_y+24]
2815
2816 ld [$ap+64],$a0 ! in1_z
2817 ld [$ap+64+4],$t0
2818 ld [$ap+64+8],$a1
2819 ld [$ap+64+12],$t1
2820 ld [$ap+64+16],$a2
2821 ld [$ap+64+20],$t2
2822 ld [$ap+64+24],$a3
2823 ld [$ap+64+28],$t3
2824 sllx $t0,32,$t0
2825 sllx $t1,32,$t1
2826 or $a0,$t0,$a0
2827 sllx $t2,32,$t2
2828 or $a1,$t1,$a1
2829 sllx $t3,32,$t3
2830 stx $a0,[%sp+LOCALS64+$in1_z]
2831 or $a2,$t2,$a2
2832 stx $a1,[%sp+LOCALS64+$in1_z+8]
2833 or $a3,$t3,$a3
2834 stx $a2,[%sp+LOCALS64+$in1_z+16]
2835 stx $a3,[%sp+LOCALS64+$in1_z+24]
2836
2837 or $a1,$a0,$t0
2838 or $a3,$a2,$t2
2839 or $t2,$t0,$t0
2840 movrnz $t0,-1,$t0 ! !in1infty
2841 stx $t0,[%fp+STACK_BIAS-16]
2842
2843 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z);
2844 add %sp,LOCALS64+$Z1sqr,$rp
2845
2846 ldx [%sp+LOCALS64+$in2_x],$bi
2847 mov $acc0,$a0
2848 mov $acc1,$a1
2849 mov $acc2,$a2
2850 mov $acc3,$a3
2851 add %sp,LOCALS64+$in2_x,$bp
2852 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, Z1sqr, in2_x);
2853 add %sp,LOCALS64+$U2,$rp
2854
2855 ldx [%sp+LOCALS64+$Z1sqr],$bi ! forward load
2856 ldx [%sp+LOCALS64+$in1_z],$a0
2857 ldx [%sp+LOCALS64+$in1_z+8],$a1
2858 ldx [%sp+LOCALS64+$in1_z+16],$a2
2859 ldx [%sp+LOCALS64+$in1_z+24],$a3
2860
2861 add %sp,LOCALS64+$in1_x,$bp
2862 call __ecp_nistz256_sub_from_vis3 ! p256_sub(H, U2, in1_x);
2863 add %sp,LOCALS64+$H,$rp
2864
2865 add %sp,LOCALS64+$Z1sqr,$bp
2866 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, Z1sqr, in1_z);
2867 add %sp,LOCALS64+$S2,$rp
2868
2869 ldx [%sp+LOCALS64+$H],$bi
2870 ldx [%sp+LOCALS64+$in1_z],$a0
2871 ldx [%sp+LOCALS64+$in1_z+8],$a1
2872 ldx [%sp+LOCALS64+$in1_z+16],$a2
2873 ldx [%sp+LOCALS64+$in1_z+24],$a3
2874 add %sp,LOCALS64+$H,$bp
2875 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_z, H, in1_z);
2876 add %sp,LOCALS64+$res_z,$rp
2877
2878 ldx [%sp+LOCALS64+$S2],$bi
2879 ldx [%sp+LOCALS64+$in2_y],$a0
2880 ldx [%sp+LOCALS64+$in2_y+8],$a1
2881 ldx [%sp+LOCALS64+$in2_y+16],$a2
2882 ldx [%sp+LOCALS64+$in2_y+24],$a3
2883 add %sp,LOCALS64+$S2,$bp
2884 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, S2, in2_y);
2885 add %sp,LOCALS64+$S2,$rp
2886
2887 ldx [%sp+LOCALS64+$H],$a0 ! forward load
2888 ldx [%sp+LOCALS64+$H+8],$a1
2889 ldx [%sp+LOCALS64+$H+16],$a2
2890 ldx [%sp+LOCALS64+$H+24],$a3
2891
2892 add %sp,LOCALS64+$in1_y,$bp
2893 call __ecp_nistz256_sub_from_vis3 ! p256_sub(R, S2, in1_y);
2894 add %sp,LOCALS64+$R,$rp
2895
2896 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Hsqr, H);
2897 add %sp,LOCALS64+$Hsqr,$rp
2898
2899 ldx [%sp+LOCALS64+$R],$a0
2900 ldx [%sp+LOCALS64+$R+8],$a1
2901 ldx [%sp+LOCALS64+$R+16],$a2
2902 ldx [%sp+LOCALS64+$R+24],$a3
2903 call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Rsqr, R);
2904 add %sp,LOCALS64+$Rsqr,$rp
2905
2906 ldx [%sp+LOCALS64+$H],$bi
2907 ldx [%sp+LOCALS64+$Hsqr],$a0
2908 ldx [%sp+LOCALS64+$Hsqr+8],$a1
2909 ldx [%sp+LOCALS64+$Hsqr+16],$a2
2910 ldx [%sp+LOCALS64+$Hsqr+24],$a3
2911 add %sp,LOCALS64+$H,$bp
2912 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(Hcub, Hsqr, H);
2913 add %sp,LOCALS64+$Hcub,$rp
2914
2915 ldx [%sp+LOCALS64+$Hsqr],$bi
2916 ldx [%sp+LOCALS64+$in1_x],$a0
2917 ldx [%sp+LOCALS64+$in1_x+8],$a1
2918 ldx [%sp+LOCALS64+$in1_x+16],$a2
2919 ldx [%sp+LOCALS64+$in1_x+24],$a3
2920 add %sp,LOCALS64+$Hsqr,$bp
2921 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(U2, in1_x, Hsqr);
2922 add %sp,LOCALS64+$U2,$rp
2923
2924 call __ecp_nistz256_mul_by_2_vis3 ! p256_mul_by_2(Hsqr, U2);
2925 add %sp,LOCALS64+$Hsqr,$rp
2926
2927 add %sp,LOCALS64+$Rsqr,$bp
2928 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_x, Rsqr, Hsqr);
2929 add %sp,LOCALS64+$res_x,$rp
2930
2931 add %sp,LOCALS64+$Hcub,$bp
2932 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_x, res_x, Hcub);
2933 add %sp,LOCALS64+$res_x,$rp
2934
2935 ldx [%sp+LOCALS64+$Hcub],$bi ! forward load
2936 ldx [%sp+LOCALS64+$in1_y],$a0
2937 ldx [%sp+LOCALS64+$in1_y+8],$a1
2938 ldx [%sp+LOCALS64+$in1_y+16],$a2
2939 ldx [%sp+LOCALS64+$in1_y+24],$a3
2940
2941 add %sp,LOCALS64+$U2,$bp
2942 call __ecp_nistz256_sub_morf_vis3 ! p256_sub(res_y, U2, res_x);
2943 add %sp,LOCALS64+$res_y,$rp
2944
2945 add %sp,LOCALS64+$Hcub,$bp
2946 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(S2, in1_y, Hcub);
2947 add %sp,LOCALS64+$S2,$rp
2948
2949 ldx [%sp+LOCALS64+$R],$bi
2950 ldx [%sp+LOCALS64+$res_y],$a0
2951 ldx [%sp+LOCALS64+$res_y+8],$a1
2952 ldx [%sp+LOCALS64+$res_y+16],$a2
2953 ldx [%sp+LOCALS64+$res_y+24],$a3
2954 add %sp,LOCALS64+$R,$bp
2955 call __ecp_nistz256_mul_mont_vis3 ! p256_mul_mont(res_y, res_y, R);
2956 add %sp,LOCALS64+$res_y,$rp
2957
2958 add %sp,LOCALS64+$S2,$bp
2959 call __ecp_nistz256_sub_from_vis3 ! p256_sub(res_y, res_y, S2);
2960 add %sp,LOCALS64+$res_y,$rp
2961
2962 ldx [%fp+STACK_BIAS-16],$t1 ! !in1infty
2963 ldx [%fp+STACK_BIAS-8],$t2 ! !in2infty
29641: call .+8
2965 add %o7,.Lone_mont_vis3-1b,$bp
2966___
2967for($i=0;$i<64;$i+=16) { # conditional moves
2968$code.=<<___;
2969 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2970 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2971 ldx [%sp+LOCALS64+$in2_x+$i],$acc2 ! in2
2972 ldx [%sp+LOCALS64+$in2_x+$i+8],$acc3
2973 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2974 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2975 movrz $t1,$acc2,$acc0
2976 movrz $t1,$acc3,$acc1
2977 movrz $t2,$acc4,$acc0
2978 movrz $t2,$acc5,$acc1
2979 srlx $acc0,32,$acc2
2980 srlx $acc1,32,$acc3
2981 st $acc0,[$rp_real+$i]
2982 st $acc2,[$rp_real+$i+4]
2983 st $acc1,[$rp_real+$i+8]
2984 st $acc3,[$rp_real+$i+12]
2985___
2986}
2987for(;$i<96;$i+=16) {
2988$code.=<<___;
2989 ldx [%sp+LOCALS64+$res_x+$i],$acc0 ! res
2990 ldx [%sp+LOCALS64+$res_x+$i+8],$acc1
2991 ldx [$bp+$i-64],$acc2 ! "in2"
2992 ldx [$bp+$i-64+8],$acc3
2993 ldx [%sp+LOCALS64+$in1_x+$i],$acc4 ! in1
2994 ldx [%sp+LOCALS64+$in1_x+$i+8],$acc5
2995 movrz $t1,$acc2,$acc0
2996 movrz $t1,$acc3,$acc1
2997 movrz $t2,$acc4,$acc0
2998 movrz $t2,$acc5,$acc1
2999 srlx $acc0,32,$acc2
3000 srlx $acc1,32,$acc3
3001 st $acc0,[$rp_real+$i]
3002 st $acc2,[$rp_real+$i+4]
3003 st $acc1,[$rp_real+$i+8]
3004 st $acc3,[$rp_real+$i+12]
3005___
3006}
3007$code.=<<___;
3008 ret
3009 restore
ff823ee8 3010.type ecp_nistz256_point_add_affine_vis3,#function
3011.size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
3012.align 64
3013.Lone_mont_vis3:
3014.long 0x00000000,0x00000001, 0xffffffff,0x00000000
3015.long 0xffffffff,0xffffffff, 0x00000000,0xfffffffe
3016.align 64
3017___
3018} }}}
3019\f
3020 # The purpose of this subroutine is to explicitly encode VIS instructions,
3021 # so that the module can be compiled without having to specify VIS
3022 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
3023 # The idea is to preserve the option of producing a "universal" binary and
3024 # to let the programmer detect at run-time whether the current CPU is VIS-capable.
3025sub unvis3 {
3026my ($mnemonic,$rs1,$rs2,$rd)=@_;
3027my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
3028my ($ref,$opf);
3029my %visopf = ( "addxc" => 0x011,
3030 "addxccc" => 0x013,
3031 "umulxhi" => 0x016 );
3032
3033 $ref = "$mnemonic\t$rs1,$rs2,$rd";
3034
3035 if ($opf=$visopf{$mnemonic}) {
3036 foreach ($rs1,$rs2,$rd) {
3037 return $ref if (!/%([goli])([0-9])/);
3038 $_=$bias{$1}+$2;
3039 }
3040
3041 return sprintf ".word\t0x%08x !%s",
3042 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
3043 $ref;
3044 } else {
3045 return $ref;
3046 }
3047}
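
# Illustrative usage example (assumption: ECP_NISTZ256_DEBUG_VIS3 is a
# hypothetical debug switch, not used elsewhere in the build): when set, show
# on stderr how one VIS3 instruction is hand-encoded by the routine above,
# e.g. "addxc %o0,%o1,%o2" becomes ".word 0x95b20229".
print STDERR unvis3("addxc","%o0","%o1","%o2"),"\n" if ($ENV{ECP_NISTZ256_DEBUG_VIS3});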
3048
3049foreach (split("\n",$code)) {
3050 s/\`([^\`]*)\`/eval $1/ge;
3051
3052 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
3053 &unvis3($1,$2,$3,$4)
3054 /ge;
3055
3056 print $_,"\n";
3057}
3058
a21314db 3059close STDOUT or die "error closing STDOUT: $!";