]> git.ipfire.org Git - thirdparty/openssl.git/commitdiff
Always end BN_mod_exp_mont_consttime with normal Montgomery reduction.
authorTomas Mraz <tomas@openssl.org>
Thu, 9 Jun 2022 10:34:55 +0000 (12:34 +0200)
committerTomas Mraz <tomas@openssl.org>
Thu, 16 Jun 2022 13:22:35 +0000 (15:22 +0200)
This partially fixes a bug where, on x86_64, BN_mod_exp_mont_consttime
would sometimes return m, the modulus, when it should have returned
zero. Thanks to Guido Vranken for reporting it. It is only a partial fix
because the same bug also exists in the "rsaz" codepath.

The bug only affects zero outputs (with non-zero inputs), so we believe
it has no security impact on our cryptographic functions.

The fx is to delete lowercase bn_from_montgomery altogether, and have the
mont5 path use the same BN_from_montgomery ending as the non-mont5 path.
This only impacts the final step of the whole exponentiation and has no
measurable perf impact.

See the original BoringSSL commit
https://boringssl.googlesource.com/boringssl/+/13c9d5c69d04485a7a8840c12185c832026c8315
for further analysis.

Original-author: David Benjamin <davidben@google.com>

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18510)

crypto/bn/asm/x86_64-mont5.pl
crypto/bn/bn_exp.c
test/recipes/10-test_bn_data/bnmod.txt

index c03473f13ae17fdc543a1f59010b77edd8adb9da..1faea0bcf89723774a0e3c3719e1430c9497a9f5 100755 (executable)
@@ -2103,193 +2103,6 @@ __bn_post4x_internal:
 .size  __bn_post4x_internal,.-__bn_post4x_internal
 ___
 }
-{
-$code.=<<___;
-.globl bn_from_montgomery
-.type  bn_from_montgomery,\@abi-omnipotent
-.align 32
-bn_from_montgomery:
-.cfi_startproc
-       testl   \$7,`($win64?"48(%rsp)":"%r9d")`
-       jz      bn_from_mont8x
-       xor     %eax,%eax
-       ret
-.cfi_endproc
-.size  bn_from_montgomery,.-bn_from_montgomery
-
-.type  bn_from_mont8x,\@function,6
-.align 32
-bn_from_mont8x:
-.cfi_startproc
-       .byte   0x67
-       mov     %rsp,%rax
-.cfi_def_cfa_register  %rax
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-.Lfrom_prologue:
-
-       shl     \$3,${num}d             # convert $num to bytes
-       lea     ($num,$num,2),%r10      # 3*$num in bytes
-       neg     $num
-       mov     ($n0),$n0               # *n0
-
-       ##############################################################
-       # Ensure that stack frame doesn't alias with $rptr+3*$num
-       # modulo 4096, which covers ret[num], am[num] and n[num]
-       # (see bn_exp.c). The stack is allocated to aligned with
-       # bn_power5's frame, and as bn_from_montgomery happens to be
-       # last operation, we use the opportunity to cleanse it.
-       #
-       lea     -320(%rsp,$num,2),%r11
-       mov     %rsp,%rbp
-       sub     $rptr,%r11
-       and     \$4095,%r11
-       cmp     %r11,%r10
-       jb      .Lfrom_sp_alt
-       sub     %r11,%rbp               # align with $aptr
-       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
-       jmp     .Lfrom_sp_done
-
-.align 32
-.Lfrom_sp_alt:
-       lea     4096-320(,$num,2),%r10
-       lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
-       sub     %r10,%r11
-       mov     \$0,%r10
-       cmovc   %r10,%r11
-       sub     %r11,%rbp
-.Lfrom_sp_done:
-       and     \$-64,%rbp
-       mov     %rsp,%r11
-       sub     %rbp,%r11
-       and     \$-4096,%r11
-       lea     (%rbp,%r11),%rsp
-       mov     (%rsp),%r10
-       cmp     %rbp,%rsp
-       ja      .Lfrom_page_walk
-       jmp     .Lfrom_page_walk_done
-
-.Lfrom_page_walk:
-       lea     -4096(%rsp),%rsp
-       mov     (%rsp),%r10
-       cmp     %rbp,%rsp
-       ja      .Lfrom_page_walk
-.Lfrom_page_walk_done:
-
-       mov     $num,%r10
-       neg     $num
-
-       ##############################################################
-       # Stack layout
-       #
-       # +0    saved $num, used in reduction section
-       # +8    &t[2*$num], used in reduction section
-       # +32   saved *n0
-       # +40   saved %rsp
-       # +48   t[2*$num]
-       #
-       mov     $n0,  32(%rsp)
-       mov     %rax, 40(%rsp)          # save original %rsp
-.cfi_cfa_expression    %rsp+40,deref,+8
-.Lfrom_body:
-       mov     $num,%r11
-       lea     48(%rsp),%rax
-       pxor    %xmm0,%xmm0
-       jmp     .Lmul_by_1
-
-.align 32
-.Lmul_by_1:
-       movdqu  ($aptr),%xmm1
-       movdqu  16($aptr),%xmm2
-       movdqu  32($aptr),%xmm3
-       movdqa  %xmm0,(%rax,$num)
-       movdqu  48($aptr),%xmm4
-       movdqa  %xmm0,16(%rax,$num)
-       .byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00      # lea   64($aptr),$aptr
-       movdqa  %xmm1,(%rax)
-       movdqa  %xmm0,32(%rax,$num)
-       movdqa  %xmm2,16(%rax)
-       movdqa  %xmm0,48(%rax,$num)
-       movdqa  %xmm3,32(%rax)
-       movdqa  %xmm4,48(%rax)
-       lea     64(%rax),%rax
-       sub     \$64,%r11
-       jnz     .Lmul_by_1
-
-       movq    $rptr,%xmm1
-       movq    $nptr,%xmm2
-       .byte   0x67
-       mov     $nptr,%rbp
-       movq    %r10, %xmm3             # -num
-___
-$code.=<<___ if ($addx);
-       mov     OPENSSL_ia32cap_P+8(%rip),%r11d
-       and     \$0x80108,%r11d
-       cmp     \$0x80108,%r11d         # check for AD*X+BMI2+BMI1
-       jne     .Lfrom_mont_nox
-
-       lea     (%rax,$num),$rptr
-       call    __bn_sqrx8x_reduction
-       call    __bn_postx4x_internal
-
-       pxor    %xmm0,%xmm0
-       lea     48(%rsp),%rax
-       jmp     .Lfrom_mont_zero
-
-.align 32
-.Lfrom_mont_nox:
-___
-$code.=<<___;
-       call    __bn_sqr8x_reduction
-       call    __bn_post4x_internal
-
-       pxor    %xmm0,%xmm0
-       lea     48(%rsp),%rax
-       jmp     .Lfrom_mont_zero
-
-.align 32
-.Lfrom_mont_zero:
-       mov     40(%rsp),%rsi           # restore %rsp
-.cfi_def_cfa   %rsi,8
-       movdqa  %xmm0,16*0(%rax)
-       movdqa  %xmm0,16*1(%rax)
-       movdqa  %xmm0,16*2(%rax)
-       movdqa  %xmm0,16*3(%rax)
-       lea     16*4(%rax),%rax
-       sub     \$32,$num
-       jnz     .Lfrom_mont_zero
-
-       mov     \$1,%rax
-       mov     -48(%rsi),%r15
-.cfi_restore   %r15
-       mov     -40(%rsi),%r14
-.cfi_restore   %r14
-       mov     -32(%rsi),%r13
-.cfi_restore   %r13
-       mov     -24(%rsi),%r12
-.cfi_restore   %r12
-       mov     -16(%rsi),%rbp
-.cfi_restore   %rbp
-       mov     -8(%rsi),%rbx
-.cfi_restore   %rbx
-       lea     (%rsi),%rsp
-.cfi_def_cfa_register  %rsp
-.Lfrom_epilogue:
-       ret
-.cfi_endproc
-.size  bn_from_mont8x,.-bn_from_mont8x
-___
-}
 }}}
 \f
 if ($addx) {{{
@@ -3896,10 +3709,6 @@ mul_handler:
        .rva    .LSEH_begin_bn_power5
        .rva    .LSEH_end_bn_power5
        .rva    .LSEH_info_bn_power5
-
-       .rva    .LSEH_begin_bn_from_mont8x
-       .rva    .LSEH_end_bn_from_mont8x
-       .rva    .LSEH_info_bn_from_mont8x
 ___
 $code.=<<___ if ($addx);
        .rva    .LSEH_begin_bn_mulx4x_mont_gather5
@@ -3931,11 +3740,6 @@ $code.=<<___;
        .byte   9,0,0,0
        .rva    mul_handler
        .rva    .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue       # HandlerData[]
-.align 8
-.LSEH_info_bn_from_mont8x:
-       .byte   9,0,0,0
-       .rva    mul_handler
-       .rva    .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue             # HandlerData[]
 ___
 $code.=<<___ if ($addx);
 .align 8
index 89ce77fbb772d64cccdd1826e1734b660a1ed580..c7b62232f3a6a243b0f69d89305d7ab3a143abe2 100644 (file)
@@ -897,14 +897,21 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 #if defined(OPENSSL_BN_ASM_MONT5)
     if (window == 5 && top > 1) {
         /*
-         * This optimization uses ideas from http://eprint.iacr.org/2011/239,
-         * specifically optimization of cache-timing attack countermeasures
-         * and pre-computation optimization.
-         */
-
-        /*
-         * Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
-         * 512-bit RSA is hardly relevant, we omit it to spare size...
+         * This optimization uses ideas from https://eprint.iacr.org/2011/239,
+         * specifically optimization of cache-timing attack countermeasures,
+         * pre-computation optimization, and Almost Montgomery Multiplication.
+         *
+         * The paper discusses a 4-bit window to optimize 512-bit modular
+         * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer
+         * important.
+         *
+         * |bn_mul_mont_gather5| and |bn_power5| implement the "almost"
+         * reduction variant, so the values here may not be fully reduced.
+         * They are bounded by R (i.e. they fit in |top| words), not |m|.
+         * Additionally, we pass these "almost" reduced inputs into
+         * |bn_mul_mont|, which implements the normal reduction variant.
+         * Given those inputs, |bn_mul_mont| may not give reduced
+         * output, but it will still produce "almost" reduced output.
          */
         void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
                                  const void *table, const BN_ULONG *np,
@@ -916,9 +923,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                        const void *table, const BN_ULONG *np,
                        const BN_ULONG *n0, int num, int power);
         int bn_get_bits5(const BN_ULONG *ap, int off);
-        int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
-                               const BN_ULONG *not_used, const BN_ULONG *np,
-                               const BN_ULONG *n0, int num);
 
         BN_ULONG *n0 = mont->n0, *np;
 
@@ -1007,14 +1011,18 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
             }
         }
 
-        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
         tmp.top = top;
-        bn_correct_top(&tmp);
-        if (ret) {
-            if (!BN_copy(rr, &tmp))
-                ret = 0;
-            goto err;           /* non-zero ret means it's not error */
-        }
+        /*
+         * The result is now in |tmp| in Montgomery form, but it may not be
+         * fully reduced. This is within bounds for |BN_from_montgomery|
+         * (tmp < R <= m*R) so it will, when converting from Montgomery form,
+         * produce a fully reduced result.
+         *
+         * This differs from Figure 2 of the paper, which uses AMM(h, 1) to
+         * convert from Montgomery form with unreduced output, followed by an
+         * extra reduction step. In the paper's terminology, we replace
+         * steps 9 and 10 with MM(h, 1).
+         */
     } else
 #endif
     {
index 6b040d395ca131465ac3a55d993934d1e4fd0bda..cc941047f5b6b2fe65a535b63db477af69bffdf5 100644 (file)
@@ -2474,6 +2474,73 @@ A = 9025e6183706105e948b1b0edf922f9011b9e11887d70adb00b26f272b9e76a38f3099084d9c
 E = d7e6df5d755284929b986cd9b61c9c2c8843f24c711fbdbae1a468edcae159400943725570726cdc92b3ea94f9f206729516fdda83e31d815b0c7720e7598a91d992273e3bd8ac413b441d8f1dfe5aa7c3bf3ef573adc38292676217467731e6cf440a59611b8110af88d3e62f60209b513b01fbb69a097458ad02096b5e38f0
 M = e4e784aa1fa88625a43ba0185a153a929663920be7fe674a4d33c943d3b898cff051482e7050a070cede53be5e89f31515772c7aea637576f99f82708f89d9e244f6ad3a24a02cbe5c0ff7bcf2dad5491f53db7c3f2698a7c41b44f086652f17bb05fe4c5c0a92433c34086b49d7e1825b28bab6c5a9bd0bc95b53d659afa0d7
 
+# The following inputs trigger an edge case between Montgomery reduction and the
+# "almost" reduction variant from https://eprint.iacr.org/2011/239
+ModExp = 00
+A = 19c7bc9b97c6083cd7b8d1cd001452c9b67983247169c6532047eb7fc8933014dbf69fee7a358769f1429802c8ea89d4f9ca6ba6f368fbdb1fa5717b4a00
+E = bbc7e09147408571050e8d0c634682c5863b7e8a573626648902cff12e590c74f5a23ecce39732266bc15b8afbd6c48a48c83fbdc33947515cc0b6e4fb98ae2cd730e58f951fec8be7e2e3c74f4506c7fd7e29bdb28675fe8a59789ab1148e931a2ebd2d36f78bc241682a3d8083d8ff538858cd240c5a693936e5a391dc9d77118062a3f868c058440a4192267faaaba91112f45eee5842060febbf9353a6d3e7f7996573209136a5506062ea23d74067f08c613f3ff74bade25f8c3368e6dba84eae672eac11be1137fc514924fcab8c82e46d092bd047dcbadaa48c67a096ec1a04f392a8511e6acbad9954949b703e71ff837337b594055ae6f3c0fc154447a687c9ac8a2cdfd64a2e680c6ff21254735af7f5eb6b43f0bce86bda55a04143a991711081435ed4f4a89b23fc3a588022b7a8543db4bf5c8ac93603367c750ff2191f59a716340fab49bb7544759c8d846465eec1438e76395f73e7b5e945f31f1b87fefa854a0d208846eaab5fa27144fd039911608bab0eaee80f1d3553dfa2d9ba95268479b97a059613660df5ad79796e0b272244aca90ccc13449ec15c206eeed7b60405a4c5cfdf5da5d136c27fa9385d810ad198dfe794ffce9955e10520efea1e2eb794e379401b9affd863b9566ce941c4726755574a1b1946acf0090bfb93f37dd55f524485bbba7fa84b53addfde01ae1de9c57fe50d4b708dd0fa45d02af398b3d05c6d17f84c11e9aacdbe0b146cad6ddbd877731e26a17f3ebed459560d12ed7a6abc2ea6fe922e69d2622ef11b6b245b9ba8f0940faaa671a4beb727be5393a94dafaeff7221b29183e7418f4c5bb95a6a586c93dbc8ce0236d9dbe26c40513611b4141fed66599adbfb20fc30e09a4815e4159f65a6708f34584a7a77b3843941cd61a6917dcc3d07a3dfb5a2cb108bacea7e782f2111b4d22ecaaeff469ecd0da371df1ac5e9bf6df6ccba2d3a9f393d597499eaca2c206bfb81c3426c5fe45bcf16e38aecd246a319a1f37041c638b75a4839517e43a6d01bee7d85eaeedbce13cd15699d3ee42c7414cfed576590e4fb6ddb6edd3e1957efaf039bfe8b9dc75869b1f93abff15cae8b234161070fa3542303c2ed35ca66083d0ac299b81182317a2a3985269602b1fa1e822fcbda48e686d80b273f06b0a702ca7f42cbbbd2fc2b3601422c8bff6302eda3c61b293049636002649b16f3c1f0be2b6599d66493a4497cd795b10a2ab8220fafad24fa90e1bfcf39ecce337e705695c7a224bf9f445a287d6aab221341659ca4be7861f6ac4c9d33dac811e6
+M = 519b6e57781d40d897ec0c1b648d195526726b295438c9a70928ac25979563d72db91c8c42298a33b572edecdf40904c68a23337aa5341b56e92b0da5041
+
+# To fully exercise BN_mod_exp_mont_consttime codepaths, we generate inputs at
+# different bitwidths. rsaz-avx2.pl only runs at 1024-bit moduli, and
+# x86_64-mont5.pl unrolls 8 64-bit words at a time, so we want to capture both
+# multiples of 512- and non-multiples. Also include moduli that are not quite a
+# full word.
+# 512-bit
+ModExp = 00
+A = 8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = 8f42c9e9e351ba9b32ab0cf69da43f4acf7028d19cff6e5059ea0e3fcc97c97f36a31470044737d4c0c933ac441ecb29e32c81401523afdac7de9c3fd8493c97
+
+# 1024-bit
+# TODO(davidben): This test breaks the RSAZ implementation. Fix it and enable
+# this test.
+# ModExp = 00
+# A = 800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002f
+# E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+# M = 9da8dc26fdf4d2e49833b240ee552beb7a6e251caa91bfb5d6cafaf8ed9461877fda8f6ac299036d35806bc1ae7872e54eaac1ec6bee6d02c6621a9cf8883b3abc33c49b3e601203e0e86ef8f0562412cc689ee2670704583909ca6d7774c9f9f9f4d77d37fedef9cb51d207cb629ec02fa03b526fd6594bfa8f2da71238a0b7
+
+# 1025-bit
+ModExp = 00
+A = 010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = 010223abfdda02e84e11cec8ee7fc784fa135733935f7b9054bb70f1f06d234d76dcf3beed55c7f39e955dc1fef2b65009240fd02f7a1b27a78fc2867144bf666efb929856db9f671c356c4c67a068a70fe83c52eebda03668872fd270d0794f0771d217fb6b93b12529a944f7f0496a9158757c55b8ee14f803f1d2d887e2f561
+
+# 1088-bit
+ModExp = 00
+A = 8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003d
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = e91f6d748773cb212a23aa348125615123b1800c9ea222c9374c757702ae4140fa333790ed8f6bf60a1d7dda65c2767cc5f33e32e333d19fbfb5a2b85795757c9ca070268763a618e9d33873d28a89bf88acd209efbb15b80cd33b92a6b3a682e1c91782fc24fb86ddff4f809219c977b54b99359094bbcc51dfe17b992ab24b74a17950ad754281
+
+# 1472-bit
+ModExp = 00
+A = 8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001d
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = a8770362f4bfe4fc1ab0e52705c11a9b6ba235d5a5f22197c2d68e27ed18426ede3316af706aa79bcf943dbd51459eb15ae1f9386216b3f3a847f94440a65b97659bc5ba2adb67173714ecaa886c0b926d7a64ea45576f9d2171784ce7e801724d5b0abfd93357d538ea7ad3ad89a74f4660bdb66dfb5f684dcf00402e3cdf0ab58afd867c943c8f47b80268a789456aa7c50a619dd2f9f5e3f74b5d810f0f8dadbf4ad5b917cdcb156c4c132611c8b3b035118a9e03551f
+
+# 1536-bit
+ModExp = 00
+A = 800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = 878cd000778f927b2f1a4b8bac86efd282079a7ac0d25e09ffd2f72fbc282e65e233929d2457c7b1d63c56fb706cdfa04fb87e654c578c98d7cf59c2293dc5641086b68db4867105981daaf147a0ee91f6932ef064deae4142c19e58d50c0686f0eaf778be72450f89a98b4680bbc5ffab942195e44dd20616150fd1deca058068ca31ab2f861e99082588f17a2025bf5e536150142fca3187a259c791fc721430f24d7e338f8dc02e693a7e694d42775e80f7f7c03600b6ae86b4aba2b0e991
+
+# 2048-bit
+ModExp = 00
+A = 8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000f
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = 9f40a7535c561208ecb38e17c9336d9bc8484d335901b2cd42759cf03689227f6992f10cb6b586d767fbcdf30e9d82a0eda60d2694ccd0194fa96b50b56e0cdeec1951ea9e58b07e334a7f108841a0ab28256917fecea561388807ed124a17386a7a7b501f9cbf3404247a76948d0561e48137d3f9669e36f175731796aeaf78851f7d866917f661422186a4814aa35c066b5a90b9cfc918af769a9f0bb30c12581027df64ac328a0f07dbd20adb704479f6d0f233a131828c71bab19c3c34795ea4fb68aa632c6f688e5b3b84413c9031d8dc251003a590dec0dd09bfa6109ed4570701439b6f265b84ac2170c317357b5fbe5535e2bbdd93c1aacfdaa28c85
+
+# 3072-bit
+ModExp = 00
+A = 80000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001d
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = c23dfd244a58a668d514498a705c8f8f548311b24f0f98b023d2d33632534c2ae948d6641d41fd7a29fbbd594bfc7fdd6e8162cbb3056af3075347b6fc8876458d33a9d0ffdbcdf482de0c73d1310fd8fa8f9f92dd0dbb0e2034e98a30f6c11b482f7476c5b593f673a322b1130daa4314e9074270dce1076436f0d56cf196afcbb235a9a7b3ac85b9062e85fc0e63a12c468c787019f6805f9faab64fc6a0babc80785d88740243f11366bffb40ccbe8b2bb7a99a2c8238a6f656bb0117d7b2602aa400f4d77de5f93c673f13264ca70de949454e3e3f261993c1aa427e8ef4f507af744f71f3b4aaf3c981d44cc1bfb1eb1151168762b242b740573df698e500d99612e17dc760f7b3bf7c235e39e81ad7edbe6c07dbb8b139745bb394d61cb799bcafec5de074932b0b2d74797e779ac8d81f63a2b2e9baa229dfaa7f90f34ffade1d2ad022a3407d35eb2d7477c6ae8ad100f6e95c05b4f947c1fabfb11a17add384e6b4cd3a02fd9b43f46805c6c74e366b74aa3b766be7a5fbbd67fa81
+
+# 4096-bit
+ModExp = 00
+A = 8000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
+E = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+M = 8030411ecbddcb0fe4e76fd6b5bf542e8b015d1610cf96130ded12ba2cda0641bd9692080f218ea8b0d751845b519d95b843542ec8d2a07f1f93afe3189b69a4f35c983011c7f7928c3df458cc3eae85c36e6934a4b1bc0a67c8a521de336642c49e10a7ffa8d0af911aacc19e3900449161940f139220e099a150dcaf0ff96ffff6e726c1ac139969103cf6a828ac3adf0301506aa02787b4f570d5dde53a34acab8fec6fa94760abf16ee99954371ad65a6e899daab87b95811d069404991de9abe064ebbddf886e970f10d260c899dda940191a82d4c8bd36651363aff5493f4f59e700007dcadf37ebea7fcfd7600d16617ffea0d9ae659446d851d93c564e50e558f734c894d735fa273770703dab62844d9f01badf632f3d14a00f739c022c9be95f54e9cea46ec6da7cb11f4602e06962951c48204726b7f120ddbd0eb3566dc8d1e6f195a9196e96db33322d088b43aecffe9b4df182dd016aca0bd14f1c56cd1a18b89165c027029862b09ffd78e92ab614349c4fd67f49cb12cd33d0728930d0538bda57acef1365a73cc8fbac7d463b9e3c3bae0bb6224b080cdb8b5cd47d546d53111fdc22b7ff679bcfe27192920ee163b2be337d8cccc93b4de7d2d31934b9c0e97af291dcc1135b4a473bd37114eec3ba75c411887b57799d3188e7353f33a4d31735ebfc9fcfc044985148dd96da3876a5ab7ea7a404b411
 
 # These test vectors satisfy (ModSqrt * ModSqrt) mod P = A mod P with P a prime.
 # ModSqrt is in [0, (P-1)/2].