Disable FMA by default. Use -Ofma or jit.opt.start("+fma") to enable.

author Mike Pall <mike>

Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)

committer Mike Pall <mike>

Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)
author Mike Pall <mike>
Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)
committer Mike Pall <mike>
Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)
diff --git a/doc/running.html b/doc/running.html

index 9979d223ffbd52650e9e410b8ee721f5899cf304..edc049fb73b551995880dcd7e8f63fb45a4f11fa 100644 (file)
--- a/doc/running.html
+++ b/doc/running.html
@@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level
  overrides all earlier flags.
  </p>
  <p>
+Note that <tt>-Ofma</tt> is not enabled by default at any level,
+because it affects floating-point result accuracy. Only enable this
+if you fully understand the trade-offs of FMA for performance (higher),
+determinism (lower) and numerical accuracy (higher).
+</p>
+<p>
  Here are the available flags and at what optimization levels they
  are enabled:
  </p>
@@ -251,6 +257,8 @@ are enabled:
  <td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
  <tr class="even">
  <td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
+<tr class="odd">
+<td class="flag_name">fma </td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_desc">Fused multiply-add</td></tr>
  </table>
  <p>
  Here are the parameters and their default settings:
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h

index 326330f476b8bfad8256b5b10681f087e271b9e8..ba6267eccfde41165f4d24942ee4ffef9bd361e1 100644 (file)
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref,
  }
  
  #if !LJ_SOFTFP
-/* Fuse to multiply-add/sub instruction. */
+/*
+** Fuse to multiply-add/sub instruction.
+** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
+** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
+*/
  static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
  {
    IRRef lref = ir->op1, rref = ir->op2;
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h

index 4e34b3bebe9e030c3050d303d7d416ab992ceaf2..805ea54bb9e33a4ee01e9d77fa6dcfbf3e3a9c01 100644 (file)
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air)
  {
    IRRef lref = ir->op1, rref = ir->op2;
    IRIns *irm;
-  if (lref != rref &&
+  if ((as->flags & JIT_F_OPT_FMA) &&
+      lref != rref &&
        ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
         ra_noreg(irm->r)) ||
         (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h

index 546b8e5dc69c05db73436167f7cf86d412b8af0d..aa8187456342e674271ee29946dc7fcba3ad35d2 100644 (file)
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
  {
    IRRef lref = ir->op1, rref = ir->op2;
    IRIns *irm;
-  if (lref != rref &&
+  if ((as->flags & JIT_F_OPT_FMA) &&
+      lref != rref &&
        ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
         ra_noreg(irm->r)) ||
         (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_jit.h b/src/lj_jit.h

index 32b3861a11928f96eb830c5d286b653dba26d555..7f081730e47adb6a08fd630f4ede560aab877fab 100644 (file)
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -87,10 +87,11 @@
  #define JIT_F_OPT_ABC          (JIT_F_OPT << 7)
  #define JIT_F_OPT_SINK         (JIT_F_OPT << 8)
  #define JIT_F_OPT_FUSE         (JIT_F_OPT << 9)
+#define JIT_F_OPT_FMA          (JIT_F_OPT << 10)
  
  /* Optimizations names for -O. Must match the order above. */
  #define JIT_F_OPTSTRING        \
-  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
+  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
  
  /* Optimization levels set a fixed combination of flags. */
  #define JIT_F_OPT_0    0
@@ -99,6 +100,7 @@
  #define JIT_F_OPT_3    (JIT_F_OPT_2|\
    JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
  #define JIT_F_OPT_DEFAULT      JIT_F_OPT_3
+/* Note: FMA is not set by default. */
  
  /* -- JIT engine parameters ----------------------------------------------- */
  
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c

index b6cc60ba5cc9a4a0d048787e6f9997c04828905b..d0febd81dc33467b126b95f851f672c0d5938cec 100644 (file)
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
  
  /* -- Helper functions ---------------------------------------------------- */
  
+/* Required to prevent the C compiler from applying FMA optimizations.
+**
+** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
+** But the current state of C compilers is a mess in this regard.
+** Also, this function is not performance sensitive at all.
+*/
+LJ_NOINLINE static double lj_vm_floormul(double x, double y)
+{
+  return lj_vm_floor(x / y) * y;
+}
+
  double lj_vm_foldarith(double x, double y, int op)
  {
    switch (op) {
@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op)
    case IR_SUB - IR_ADD: return x-y; break;
    case IR_MUL - IR_ADD: return x*y; break;
    case IR_DIV - IR_ADD: return x/y; break;
-  case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
+  case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
    case IR_POW - IR_ADD: return pow(x, y); break;
    case IR_NEG - IR_ADD: return -x; break;
    case IR_ABS - IR_ADD: return fabs(x); break;
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc

index 3448d0d2b8e1229d423adc2934f54645be116132..36a036aee8b7796554a53f2297bf01ae2a6872f8 100644 (file)
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
      |.macro ins_arithmod, res, reg1, reg2
      |  fdiv d2, reg1, reg2
      |  frintm d2, d2
-    |  fmsub res, d2, reg2, reg1
+    |  // Cannot use fmsub, because FMA is not enabled by default.
+    |  fmul d2, d2, reg2
+    |  fsub res, reg1, d2
      |.endmacro
      |
      |.macro ins_arithdn, intins, fpins
author	Mike Pall <mike>
	Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)
committer	Mike Pall <mike>
	Wed, 7 Dec 2022 17:38:22 +0000 (18:38 +0100)
doc/running.html		patch \| blob \| blame \| history
src/lj_asm_arm.h		patch \| blob \| blame \| history
src/lj_asm_arm64.h		patch \| blob \| blame \| history
src/lj_asm_ppc.h		patch \| blob \| blame \| history
src/lj_jit.h		patch \| blob \| blame \| history
src/lj_vmmath.c		patch \| blob \| blame \| history
src/vm_arm64.dasc		patch \| blob \| blame \| history