+2013-03-07 Niels Möller <nisse@lysator.liu.se>
+
+ * x86_64/ecc-521-modp.asm: New file. 2.4 times speedup.
+
2013-03-06 Niels Möller <nisse@lysator.liu.se>
* x86_64/ecc-384-modp.asm: New file, 3 time speedup.
--- /dev/null
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+ .file "ecc-521-modp.asm"
+
+C Register allocation for nettle_ecc_521_modp.
+C
+C RP is the base pointer of the limb array that is reduced in place. All
+C memory operands in the function are of the form offset(RP).
+define(<RP>, <%rsi>)
+C U0..U9 form a ten-limb accumulator, least significant limb in U0.
+C U1 (%rbx) and U4 (%rbp) are callee-saved in the System V AMD64 ABI and
+C are saved and restored with push and pop by the function.
+C U5 reuses %rdi, which the function never reads before overwriting.
+define(<U0>, <%rax>)
+define(<U1>, <%rbx>)
+define(<U2>, <%rcx>)
+define(<U3>, <%rdx>)
+define(<U4>, <%rbp>)
+define(<U5>, <%rdi>)
+define(<U6>, <%r8>)
+define(<U7>, <%r9>)
+define(<U8>, <%r10>)
+define(<U9>, <%r11>)
+C T0 and T1 are scratch registers. Both are callee-saved and are also
+C saved and restored by the function.
+define(<T0>, <%r12>)
+define(<T1>, <%r13>)
+
+C nettle_ecc_521_modp: fold an 18-limb value at RP down to 9 limbs, in place.
+C
+C Reads the 64-bit limbs RP[0..17] (byte offsets 0..136) and stores nine
+C limbs to RP[0..8] (byte offsets 0..64). The high nine limbs start at bit
+C 576 = 521 + 55, so they are multiplied by 2^55 and added to the low nine
+C limbs; bits at position 521 and above of that sum are then added back at
+C bit 0. This treats 2^521 as 1, i.e., arithmetic modulo 2^521 - 1.
+C No final compare or subtract is done here, so the stored value is not
+C necessarily the canonical residue.
+C
+C NOTE(review): RP is %rsi, presumably the second argument once
+C W64_ENTRY(2, 0) has handled the Win64 calling convention; confirm
+C against the macro definition and the C prototype. The register %rdi is
+C overwritten (as U5) without being read.
+PROLOGUE(nettle_ecc_521_modp)
+ W64_ENTRY(2, 0)
+ C Save the callee-saved registers used as U1, U4, T0 and T1.
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ C Read the top 9 limbs, RP[9..17], and shift them left 55 bits into
+ C U0..U9. U0 gets the low 9 bits of RP[9] in its top 9 bits.
+ mov 72(RP), U1
+ mov U1, U0
+ shl $55, U0
+ shr $9, U1
+
+ C Each following step loads one limb, keeps its upper 55 bits in the
+ C new register, and ORs its low 9 bits into the top of the previous
+ C register.
+ mov 80(RP), U2
+ mov U2, T0
+ shr $9, U2
+ shl $55, T0
+ or T0, U1
+
+ mov 88(RP), U3
+ mov U3, T0
+ shr $9, U3
+ shl $55, T0
+ or T0, U2
+
+ mov 96(RP), U4
+ mov U4, T0
+ shr $9, U4
+ shl $55, T0
+ or T0, U3
+
+ mov 104(RP), U5
+ mov U5, T0
+ shr $9, U5
+ shl $55, T0
+ or T0, U4
+
+ mov 112(RP), U6
+ mov U6, T0
+ shr $9, U6
+ shl $55, T0
+ or T0, U5
+
+ mov 120(RP), U7
+ mov U7, T0
+ shr $9, U7
+ shl $55, T0
+ or T0, U6
+
+ mov 128(RP), U8
+ mov U8, T0
+ shr $9, U8
+ shl $55, T0
+ or T0, U7
+
+ C U9 ends up holding only the upper 55 bits of RP[17].
+ mov 136(RP), U9
+ mov U9, T0
+ shr $9, U9
+ shl $55, T0
+ or T0, U8
+
+ C Add the low 9 limbs, RP[0..8], with a single carry chain. The final
+ C carry is absorbed into U9.
+ add (RP), U0
+ adc 8(RP), U1
+ adc 16(RP), U2
+ adc 24(RP), U3
+ adc 32(RP), U4
+ adc 40(RP), U5
+ adc 48(RP), U6
+ adc 56(RP), U7
+ adc 64(RP), U8
+ adc $0, U9
+
+ C Top limbs are <U9, U8>. Keep low 9 bits of 8, and fold the
+ C top bits (at most 65 bits).
+ C After this block <T1, T0> is <U9, U8> shifted right 9 bits, and U8
+ C holds only bits 512..520 of the sum.
+ mov U8, T0
+ shr $9, T0
+ and $0x1ff, U8
+ mov U9, T1
+ shl $55, U9
+ shr $9, T1
+ or U9, T0
+
+ C Add the folded bits at the bottom and propagate the carry through
+ C all limbs. The stores are interleaved with the adc chain; mov does
+ C not modify the flags, so the carry is preserved between the adds.
+ add T0, U0
+ mov U0, (RP)
+ adc T1, U1
+ mov U1, 8(RP)
+ adc $0, U2
+ mov U2, 16(RP)
+ adc $0, U3
+ mov U3, 24(RP)
+ adc $0, U4
+ mov U4, 32(RP)
+ adc $0, U5
+ mov U5, 40(RP)
+ adc $0, U6
+ mov U6, 48(RP)
+ adc $0, U7
+ mov U7, 56(RP)
+ adc $0, U8
+ mov U8, 64(RP)
+
+ C Restore the saved registers in reverse order of the pushes.
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ W64_EXIT(2, 0)
+ ret
+EPILOGUE(nettle_ecc_521_modp)