]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/poly1305/asm/poly1305-mips.pl
Following the license change, modify the boilerplates in crypto/poly1305/
[thirdparty/openssl.git] / crypto / poly1305 / asm / poly1305-mips.pl
CommitLineData
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Poly1305 hash for MIPS64.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#               IALU/gcc
# R1x000        5.64/+120%      (big-endian)
# Octeon II     3.80/+280%      (little-endian)

######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
# Each symbolic register name holds the literal operand string "$N".
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# The target ABI flavour is the first command-line argument.
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

# Only flavours matching /64|n32/ are implemented by this module.
die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Return-value register: $a0 under NUBI, $t0 (the former $v0) otherwise.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Operand of the .mask directive in poly1305_blocks_internal: bits 12-17
# under NUBI (all six registers stored in its frame), bits 16-17 otherwise.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Argument registers of the generated routines, then scratch registers.
# Note that $tmp2 is $at, hence the ".set noat" in the emitted code.
($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# poly1305_init($ctx,$inp)
#
# Zeroes the three 64-bit hash words at 0/8/16($ctx). When the key pointer
# $inp is non-zero, loads 16 key bytes as two 64-bit words (ldl/ldr pairs
# before MIPS64R6, plain ld on R6; byte-swapped on MIPSEB), clamps them with
# 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, and stores r0 at 24($ctx),
# r1 at 32($ctx) and s1 = r1 + (r1 >> 2) at 40($ctx). Returns 0 in $v0.
$code.=<<___;
#include "mips_arch.h"

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set noat
.set noreorder

.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
 .frame $sp,0,$ra
 .set reorder

 sd $zero,0($ctx)
 sd $zero,8($ctx)
 sd $zero,16($ctx)

 beqz $inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
 ld $in0,0($inp)
 ld $in1,8($inp)
#else
 ldl $in0,0+MSB($inp)
 ldl $in1,8+MSB($inp)
 ldr $in0,0+LSB($inp)
 ldr $in1,8+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
 dsbh $in0,$in0 # byte swap
 dsbh $in1,$in1
 dshd $in0,$in0
 dshd $in1,$in1
# else
 ori $tmp0,$zero,0xFF
 dsll $tmp2,$tmp0,32
 or $tmp0,$tmp2 # 0x000000FF000000FF

 and $tmp1,$in0,$tmp0 # byte swap
 and $tmp3,$in1,$tmp0
 dsrl $tmp2,$in0,24
 dsrl $tmp4,$in1,24
 dsll $tmp1,24
 dsll $tmp3,24
 and $tmp2,$tmp0
 and $tmp4,$tmp0
 dsll $tmp0,8 # 0x0000FF000000FF00
 or $tmp1,$tmp2
 or $tmp3,$tmp4
 and $tmp2,$in0,$tmp0
 and $tmp4,$in1,$tmp0
 dsrl $in0,8
 dsrl $in1,8
 dsll $tmp2,8
 dsll $tmp4,8
 and $in0,$tmp0
 and $in1,$tmp0
 or $tmp1,$tmp2
 or $tmp3,$tmp4
 or $in0,$tmp1
 or $in1,$tmp3
 dsrl $tmp1,$in0,32
 dsrl $tmp3,$in1,32
 dsll $in0,32
 dsll $in1,32
 or $in0,$tmp1
 or $in1,$tmp3
# endif
#endif
 li $tmp0,1
 dsll $tmp0,32
 daddiu $tmp0,-63
 dsll $tmp0,28
 daddiu $tmp0,-1 # 0ffffffc0fffffff

 and $in0,$tmp0
 daddiu $tmp0,-3 # 0ffffffc0ffffffc
 and $in1,$tmp0

 sd $in0,24($ctx)
 dsrl $tmp0,$in1,2
 sd $in1,32($ctx)
 daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
 sd $tmp0,40($ctx)

.Lno_key:
 li $v0,0 # return 0
 jr $ra
.end poly1305_init
___
# poly1305_blocks($ctx,$inp,$len,$padbit)
#
# The public entry point shifts $len right by 4 to obtain the number of
# complete 16-byte blocks and returns at once when that is zero; otherwise
# it branches to poly1305_blocks_internal. The internal routine sets up a
# 6*8-byte frame, saves $s4/$s5 (plus $s0-$s3 under NUBI), loads the hash
# from 0/8/16($ctx) and the key material from 24/32/40($ctx), and then for
# every block: loads 16 input bytes (byte-swapped on MIPSEB), adds them and
# $padbit to the hash, multiplies by the key and folds the top limb back
# into the low limbs ("final reduction"). The hash is stored back on exit.
#
# Register aliases: h0-h2 hash, r0/r1 key, rs1 the stored r1 + (r1 >> 2),
# d0-d2 product limbs.
#
# NB: the key multiple is deliberately called $rs1 rather than $s1. A
# lexical $s1 declared here would shadow the register name $s1 inside this
# block, so the NUBI prologue/epilogue lines "sd/ld \$s1,8(\$sp)" would
# save and restore $s5 a second time and leave the real $s1 register,
# which carries h1, clobbered.
{
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
 ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

$code.=<<___;
.align 5
.globl poly1305_blocks
.ent poly1305_blocks
poly1305_blocks:
 .set noreorder
 dsrl $len,4 # number of complete blocks
 bnez $len,poly1305_blocks_internal
 nop
 jr $ra
 nop
.end poly1305_blocks

.align 5
.ent poly1305_blocks_internal
poly1305_blocks_internal:
 .frame $sp,6*8,$ra
 .mask $SAVED_REGS_MASK,-8
 .set noreorder
 dsubu $sp,6*8
 sd $s5,40($sp)
 sd $s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
 sd $s3,24($sp)
 sd $s2,16($sp)
 sd $s1,8($sp)
 sd $s0,0($sp)
___
$code.=<<___;
 .set reorder

 ld $h0,0($ctx) # load hash value
 ld $h1,8($ctx)
 ld $h2,16($ctx)

 ld $r0,24($ctx) # load key
 ld $r1,32($ctx)
 ld $rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
 ld $in0,0($inp) # load input
 ld $in1,8($inp)
#else
 ldl $in0,0+MSB($inp) # load input
 ldl $in1,8+MSB($inp)
 ldr $in0,0+LSB($inp)
 ldr $in1,8+LSB($inp)
#endif
 daddiu $len,-1
 daddiu $inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
 dsbh $in0,$in0 # byte swap
 dsbh $in1,$in1
 dshd $in0,$in0
 dshd $in1,$in1
# else
 ori $tmp0,$zero,0xFF
 dsll $tmp2,$tmp0,32
 or $tmp0,$tmp2 # 0x000000FF000000FF

 and $tmp1,$in0,$tmp0 # byte swap
 and $tmp3,$in1,$tmp0
 dsrl $tmp2,$in0,24
 dsrl $tmp4,$in1,24
 dsll $tmp1,24
 dsll $tmp3,24
 and $tmp2,$tmp0
 and $tmp4,$tmp0
 dsll $tmp0,8 # 0x0000FF000000FF00
 or $tmp1,$tmp2
 or $tmp3,$tmp4
 and $tmp2,$in0,$tmp0
 and $tmp4,$in1,$tmp0
 dsrl $in0,8
 dsrl $in1,8
 dsll $tmp2,8
 dsll $tmp4,8
 and $in0,$tmp0
 and $in1,$tmp0
 or $tmp1,$tmp2
 or $tmp3,$tmp4
 or $in0,$tmp1
 or $in1,$tmp3
 dsrl $tmp1,$in0,32
 dsrl $tmp3,$in1,32
 dsll $in0,32
 dsll $in1,32
 or $in0,$tmp1
 or $in1,$tmp3
# endif
#endif
 daddu $h0,$in0 # accumulate input
 daddu $h1,$in1
 sltu $tmp0,$h0,$in0
 sltu $tmp1,$h1,$in1
 daddu $h1,$tmp0

 dmultu ($r0,$h0) # h0*r0
 daddu $h2,$padbit
 sltu $tmp0,$h1,$tmp0
 mflo ($d0,$r0,$h0)
 mfhi ($d1,$r0,$h0)

 dmultu ($rs1,$h1) # h1*5*r1
 daddu $tmp0,$tmp1
 daddu $h2,$tmp0
 mflo ($tmp0,$rs1,$h1)
 mfhi ($tmp1,$rs1,$h1)

 dmultu ($r1,$h0) # h0*r1
 daddu $d0,$tmp0
 daddu $d1,$tmp1
 mflo ($tmp2,$r1,$h0)
 mfhi ($d2,$r1,$h0)
 sltu $tmp0,$d0,$tmp0
 daddu $d1,$tmp0

 dmultu ($r0,$h1) # h1*r0
 daddu $d1,$tmp2
 sltu $tmp2,$d1,$tmp2
 mflo ($tmp0,$r0,$h1)
 mfhi ($tmp1,$r0,$h1)
 daddu $d2,$tmp2

 dmultu ($rs1,$h2) # h2*5*r1
 daddu $d1,$tmp0
 daddu $d2,$tmp1
 mflo ($tmp2,$rs1,$h2)

 dmultu ($r0,$h2) # h2*r0
 sltu $tmp0,$d1,$tmp0
 daddu $d2,$tmp0
 mflo ($tmp3,$r0,$h2)

 daddu $d1,$tmp2
 daddu $d2,$tmp3
 sltu $tmp2,$d1,$tmp2
 daddu $d2,$tmp2

 li $tmp0,-4 # final reduction
 and $tmp0,$d2
 dsrl $tmp1,$d2,2
 andi $h2,$d2,3
 daddu $tmp0,$tmp1
 daddu $h0,$d0,$tmp0
 sltu $tmp0,$h0,$tmp0
 daddu $h1,$d1,$tmp0
 sltu $tmp0,$h1,$tmp0
 daddu $h2,$h2,$tmp0

 bnez $len,.Loop

 sd $h0,0($ctx) # store hash value
 sd $h1,8($ctx)
 sd $h2,16($ctx)

 .set noreorder
 ld $s5,40($sp) # epilogue
 ld $s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
 ld $s3,24($sp)
 ld $s2,16($sp)
 ld $s1,8($sp)
 ld $s0,0($sp)
___
$code.=<<___;
 jr $ra
 daddu $sp,6*8
.end poly1305_blocks_internal
___
}
# poly1305_emit($ctx,$mac,$nonce)
#
# Loads the hash from 0/8/16($ctx), computes hash + 5 and selects that sum
# in place of the hash when the addition carries into bit 2 of the top
# limb (the "compare to modulus" step), adds the nonce read as four 32-bit
# words from $nonce, and stores the low 128 bits of the result to $mac one
# byte at a time, least-significant byte first.
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align 5
.globl poly1305_emit
.ent poly1305_emit
poly1305_emit:
 .frame $sp,0,$ra
 .set reorder

 ld $tmp0,0($ctx)
 ld $tmp1,8($ctx)
 ld $tmp2,16($ctx)

 daddiu $in0,$tmp0,5 # compare to modulus
 sltiu $tmp3,$in0,5
 daddu $in1,$tmp1,$tmp3
 sltu $tmp3,$in1,$tmp3
 daddu $tmp2,$tmp2,$tmp3

 dsrl $tmp2,2 # see if it carried/borrowed
 dsubu $tmp2,$zero,$tmp2
 nor $tmp3,$zero,$tmp2

 and $in0,$tmp2
 and $tmp0,$tmp3
 and $in1,$tmp2
 and $tmp1,$tmp3
 or $in0,$tmp0
 or $in1,$tmp1

 lwu $tmp0,0($nonce) # load nonce
 lwu $tmp1,4($nonce)
 lwu $tmp2,8($nonce)
 lwu $tmp3,12($nonce)
 dsll $tmp1,32
 dsll $tmp3,32
 or $tmp0,$tmp1
 or $tmp2,$tmp3

 daddu $in0,$tmp0 # accumulate nonce
 daddu $in1,$tmp2
 sltu $tmp0,$in0,$tmp0
 daddu $in1,$tmp0

 dsrl $tmp0,$in0,8 # write mac value
 dsrl $tmp1,$in0,16
 dsrl $tmp2,$in0,24
 sb $in0,0($mac)
 dsrl $tmp3,$in0,32
 sb $tmp0,1($mac)
 dsrl $tmp0,$in0,40
 sb $tmp1,2($mac)
 dsrl $tmp1,$in0,48
 sb $tmp2,3($mac)
 dsrl $tmp2,$in0,56
 sb $tmp3,4($mac)
 dsrl $tmp3,$in1,8
 sb $tmp0,5($mac)
 dsrl $tmp0,$in1,16
 sb $tmp1,6($mac)
 dsrl $tmp1,$in1,24
 sb $tmp2,7($mac)

 sb $in1,8($mac)
 dsrl $tmp2,$in1,32
 sb $tmp3,9($mac)
 dsrl $tmp3,$in1,40
 sb $tmp0,10($mac)
 dsrl $tmp0,$in1,48
 sb $tmp1,11($mac)
 dsrl $tmp1,$in1,56
 sb $tmp2,12($mac)
 sb $tmp3,13($mac)
 sb $tmp0,14($mac)
 sb $tmp1,15($mac)

 jr $ra
.end poly1305_emit
.rdata
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
}

# Write the generated assembly. The last remaining command-line argument, if
# any, names the output file; without one the code goes to the inherited
# STDOUT.
$output = pop;
if ($output) {
    # Three-argument open: characters such as '>', '|' or '&' in the file
    # name are never interpreted as part of the open mode. A failed open is
    # fatal instead of silently writing to the original STDOUT.
    open STDOUT, ">", $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. a full disk) only surface at close time.
close STDOUT or die "error closing STDOUT: $!";