AVR: target/84211 - Add a post reload register optimization pass.

author Georg-Johann Lay <avr@gjlay.de>

Sun, 17 Nov 2024 17:19:42 +0000 (18:19 +0100)

committer Georg-Johann Lay <avr@gjlay.de>

Mon, 18 Nov 2024 18:14:57 +0000 (19:14 +0100)
author Georg-Johann Lay <avr@gjlay.de>
Sun, 17 Nov 2024 17:19:42 +0000 (18:19 +0100)
committer Georg-Johann Lay <avr@gjlay.de>
Mon, 18 Nov 2024 18:14:57 +0000 (19:14 +0100)
diff --git a/gcc/common/config/avr/avr-common.cc b/gcc/common/config/avr/avr-common.cc

index f6409612eb571c11dbd21129dc94e044e8c76bae..333f950c80e3bedb15a6fba0a65ebf504928916a 100644 (file)
--- a/gcc/common/config/avr/avr-common.cc
+++ b/gcc/common/config/avr/avr-common.cc
@@ -37,6 +37,8 @@ static const struct default_options avr_option_optimization_table[] =
      { OPT_LEVELS_1_PLUS, OPT_mmain_is_OS_task, NULL, 1 },
      { OPT_LEVELS_1_PLUS, OPT_mfuse_add_, NULL, 1 },
      { OPT_LEVELS_2_PLUS, OPT_mfuse_add_, NULL, 2 },
+    { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_mfuse_move_, NULL, 3 },
+    { OPT_LEVELS_2_PLUS, OPT_mfuse_move_, NULL, 23 },
      // Stick to the "old" placement of the subreg lowering pass.
      { OPT_LEVELS_1_PLUS, OPT_fsplit_wide_types_early, NULL, 1 },
      /* Allow optimizer to introduce store data races. This used to be the
diff --git a/gcc/config/avr/avr-log.cc b/gcc/config/avr/avr-log.cc

index 5708ac3deeb8dc3e36f9a9fea2ac8a7575882a84..6f567d845e1d17ae7737264e0fdb584744a541ad 100644 (file)
--- a/gcc/config/avr/avr-log.cc
+++ b/gcc/config/avr/avr-log.cc
@@ -77,6 +77,10 @@ avr_log_t avr_log;
  /* The worker function implementing the %-codes */
  static void avr_log_vadump (FILE*, const char*, va_list);
  
+/* Forward to fprintf for convenience.  Return the number of consumed format
+   chars after a %-code, or 0 if unrecognized and nothing consumed.  */
+static int avr_forward_to_printf (FILE*, const char*, va_list);
+
  /* Wrapper for avr_log_vadump.  If STREAM is NULL we are called by avr_dump,
     i.e. output to dump_file if available.  The 2nd argument is __FUNCTION__.
     The 3rd argument is the format string. */
@@ -157,14 +161,6 @@ avr_log_vadump (FILE *file, const char *caller, va_list ap)
               }
               break;
  
-           case 'd':
-             fprintf (file, "%d", va_arg (ap, int));
-             break;
-
-           case 'x':
-             fprintf (file, "%x", va_arg (ap, int));
-             break;
-
             case 'b':
               fprintf (file, "%s", va_arg (ap, int) ? "true" : "false");
               break;
@@ -258,11 +254,18 @@ avr_log_vadump (FILE *file, const char *caller, va_list ap)
               abort();
  
             default:
-             /* Unknown %-code: Stop printing */
-
-             fprintf (file, "??? %%%c ???%s\n", *(fmt-1), fmt);
-             fmt = "";
-
+             int n_used = avr_forward_to_printf (file, fmt - 1, ap);
+             if (n_used > 0)
+               {
+                 // "-1" due to "*fmt++" above.
+                 fmt += n_used - 1;
+               }
+             else
+               {
+                 // Unknown %-code: Stop printing.
+                 fprintf (file, "??? %%%c ???%s\n", *(fmt-1), fmt);
+                 fmt = "";
+               }
               break;
             }
           break; /* % */
@@ -273,6 +276,63 @@ avr_log_vadump (FILE *file, const char *caller, va_list ap)
  }
  
  
+#define IS_INTC(c) (c == 'd' || c == 'x' || c == 'X')
+
+// Consume FMTs like:  %x  %4x  %04x  %*x  %0*x
+// and similar for 'd' or 'ld' or 'X' instead of 'x'.
+
+static int
+avr_forward_to_printf (FILE *file, const char* const fmt, va_list ap)
+{
+  const char *p = fmt;
+  bool len_p = false;
+
+  // Optional fill
+  p += p[0] == '0' || p[0] == ' ';
+
+  // optional length
+  if (p[0] >= '1' && p[0] <= '9')
+    ++p;
+  else if (p[0] == '*')
+    ++p, len_p = true;
+
+  // Type
+  const bool long_p = p[0] == 'l';
+
+  if (IS_INTC (p[0])
+      || (long_p && IS_INTC (p[1])))
+    {
+      p += 1 + long_p;
+
+      const int n_used = (int) (p - fmt);
+
+      char xfm[10] = { '%' };
+      memcpy (xfm + 1, fmt, n_used);
+      xfm[1 + n_used] = '\0';
+
+      if (len_p)
+       {
+         const int len = va_arg (ap, int);
+         if (long_p)
+           fprintf (file, xfm, len, va_arg (ap, long));
+         else
+           fprintf (file, xfm, len, va_arg (ap, int));
+       }
+      else
+       {
+         if (long_p)
+           fprintf (file, xfm, va_arg (ap, long));
+         else
+           fprintf (file, xfm, va_arg (ap, int));
+       }
+
+      return n_used;
+    }
+
+  return 0;
+}
+
+
  /* Called from avr.cc:avr_option_override().
     Parse argument of -mlog= and set respective fields in avr_log.  */
  
diff --git a/gcc/config/avr/avr-passes-fuse-move.h b/gcc/config/avr/avr-passes-fuse-move.h

new file mode 100644 (file)

index 0000000..dbed1a6
--- /dev/null
+++ b/gcc/config/avr/avr-passes-fuse-move.h
@@ -0,0 +1,1186 @@
+/* Support for avr-passes.cc for AVR 8-bit microcontrollers.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* FIXME: The documentation in hard-reg-set.h is wrong in that it states
+   that HARD_REG_SET is a scalar iff HARD_REG_SET is a macro.
+   This is not the case:  HARD_REG_SET is a typedef no matter what.
+   So in order to get the lower 32 bits (and maybe more) as a scalar
+   we have to invoke type traits as we can't #ifdef HARD_REG_SET.
+
+   elt0_getter<T, ELT>::get (t) returns a reference to the first ELT of T.
+   The primary template is only declared; its 3rd template argument
+   selects one of the two partial specializations below, depending on
+   whether T and ELT are the same type.  */
+template<typename T, typename ELT, bool = std::is_same<T, ELT>::value>
+struct elt0_getter;
+
+/* All hard regs fit in one HARD_REG_ELT_TYPE: T === ELT.
+   The set itself is the scalar, so hand it back unchanged.  */
+template<typename T, typename ELT>
+struct elt0_getter<T, ELT, true>
+{
+  static inline const ELT &get (const T &t)
+  {
+    return t;
+  }
+};
+
+/* HARD_REG_SET is not a scalar but a composite with HARD_REG_ELT_TYPE elts[].
+   Hand back its first element, elts[0].  */
+template<typename T, typename ELT>
+struct elt0_getter<T, ELT, false>
+{
+  static inline const ELT &get (const T &t)
+  {
+    return t.elts[0];
+  }
+};
+
+
+// To track known values held in General Purpose Registers R2 ... R31.
+
+struct memento_t
+{
+  // One bit for each GPR.
+  gprmask_t known = 0;
+
+  std::array<uint8_t, REG_32> values;
+
+  static gprmask_t fixed_regs_mask;
+
+  void apply (const ply_t &);
+
+  void apply_insn (rtx_insn *insn, bool unused)
+  {
+    apply_insn1 (insn, unused);
+    known &= ~memento_t::fixed_regs_mask;
+  }
+
+private:
+  void apply_insn1 (rtx_insn *, bool);
+
+public:
+  bool knows (int rno, int n = 1) const
+  {
+    gcc_checking_assert (gpr_regno_p (rno, n));
+    const gprmask_t mask = regmask (rno, n);
+    return (known & mask) == mask;
+  }
+
+  uint8_t operator[] (int rno) const
+  {
+    gcc_checking_assert (gpr_regno_p (rno));
+    return values[rno];
+  }
+
+  // Set the 8-bit register number DEST as known to hold value VAL.
+  void set_value (int dest, int val)
+  {
+    gcc_checking_assert (gpr_regno_p (dest, 1));
+    values[dest] = (uint8_t) val;
+    set_known (dest);
+  }
+
+  void copy_value (int dest, int src)
+  {
+    gcc_checking_assert (gpr_regno_p (dest, 1));
+    gcc_checking_assert (gpr_regno_p (src, 1));
+    values[dest] = values[src];
+    set_known (dest, knows (src));
+  }
+
+  void copy_values (int dest, int src, int n_bytes)
+  {
+    gcc_checking_assert (gpr_regno_p (dest, n_bytes));
+    gcc_checking_assert (gpr_regno_p (src, n_bytes));
+    if (dest < src)
+      for (int n = 0; n < n_bytes; ++n)
+       copy_value (n + dest, n + src);
+    else if (dest > src)
+      for (int n = n_bytes - 1; n >= 0; --n)
+       copy_value (n + dest, n + src);
+  }
+
+  // Get the value as a CONST_INT or NULL_RTX when any byte is unknown.
+  rtx get_value_as_const_int (int regno, int n_bytes) const
+  {
+    gcc_checking_assert (gpr_regno_p (regno, n_bytes));
+
+    if (! knows (regno, n_bytes))
+      return NULL_RTX;
+
+    const machine_mode mode = size_to_mode (n_bytes);
+    uint64_t val = 0;
+
+    for (int i = n_bytes - 1; i >= 0; --i)
+      val = 256 * val + values[regno + i];
+
+    return gen_int_mode (val, mode);
+  }
+
+  // Copy the known state and the value (provided it is known) from
+  // register SRC to register DEST.
+  void copy_values (rtx dest, rtx src)
+  {
+    if (REG_P (dest) && REG_P (src)
+       && GET_MODE_SIZE (GET_MODE (src)) <= GET_MODE_SIZE (GET_MODE (dest)))
+      {
+       int n_bytes = std::min (GET_MODE_SIZE (GET_MODE (src)),
+                               GET_MODE_SIZE (GET_MODE (dest)));
+       copy_values (REGNO (dest), REGNO (src), n_bytes);
+      }
+  }
+
+  void set_values (rtx dest, rtx src)
+  {
+    gcc_assert (REG_P (dest) && CONST_INT_P (src));
+    int regno = REGNO (dest);
+    for (int i = 0; i < GET_MODE_SIZE (GET_MODE (dest)); ++i)
+      set_value (regno + i, avr_uint8 (src, i));
+  }
+
+  // Value >= 0 of the i-th reg or -1 if unknown.
+  int value (int i) const
+  {
+    gcc_checking_assert (gpr_regno_p (i));
+    return knows (i) ? (int) values[i] : -1;
+  }
+
+  // Value >= 0 of the rno-th reg[n] or -1 if unknown.
+  int64_t value (int rno, int n, bool strict = true) const
+  {
+    gcc_assert (n <= 4);
+    gcc_checking_assert (gpr_regno_p (rno, n));
+    if (! knows (rno, n))
+      {
+       if (! strict)
+         return -1;
+       gcc_unreachable ();
+      }
+
+    uint64_t val = 0;
+    for (int r = rno + n - 1; r >= rno; --r)
+      val = 256 * val + values[r];
+
+    return val;
+  }
+
+  void set_known (int r, bool kno = true)
+  {
+    gcc_checking_assert (gpr_regno_p (r));
+    known = kno
+      ? known | (1u << r)
+      : known & ~(1u << r);
+  }
+
+  void set_unknown (int r)
+  {
+    gcc_checking_assert (gpr_regno_p (r));
+    set_known (r, false);
+  }
+
+  int n_known () const
+  {
+    return popcount_hwi (known);
+  }
+
+  // Hamming byte distance of R[n] to VAL.
+  int hamming (int r, int n, uint64_t val) const
+  {
+    gcc_assert (n <= 8);
+    gcc_checking_assert (gpr_regno_p (r, n));
+
+    int ham = 0;
+    for (int i = 0; i < n; ++i)
+      ham += value (r + i) != (uint8_t) (val >> (8 * i));
+
+    return ham;
+  }
+
+  // Calculate the Hamming byte distance, ignoring regs in IGNORES.
+  int distance_to (const memento_t &that, gprmask_t ignores = 0) const
+  {
+    int d = 0;
+    for (int r = FIRST_GPR; r < REG_32; ++r)
+      if (! (ignores & (1u << r)))
+       d += value (r) != that.value (r);
+    return d;
+  }
+
+  // Return true when *this and THAT are the same, with the only allowed
+  // exceptions as of mask IGNORES.
+  bool equals (const memento_t &that, gprmask_t ignores) const
+  {
+    if ((known & ~ignores) != (that.known & ~ignores))
+      return false;
+
+    for (int r = FIRST_GPR; r < REG_32; ++r)
+      if (! (ignores & (1u << r)))
+       if (value (r) != that.value (r))
+         return false;
+
+    return true;
+  }
+
+  // Return TRUE iff the N_BYTES registers starting at REGNO are known
+  // to contain VAL.
+  bool have_value (int rno, int n_bytes, int val) const
+  {
+    gcc_assert (n_bytes <= 4);
+    for (int i = rno; i < rno + n_bytes; ++i)
+      if (value (i) != (uint8_t) val)
+       return false;
+      else
+       val >>= 8;
+
+    return true;
+  }
+
+  // The regno of a d-reg that has a known value, or 0 if none found.
+  int known_dregno (void) const
+  {
+    const gprmask_t dregs = known & 0xffff0000 & ~memento_t::fixed_regs_mask;
+    return dregs ? clz_hwi (1) - clz_hwi (dregs) : 0;
+  }
+
+  // Return a regno for a register that contains value VAL8 and that does
+  // not overlap with the registers mentioned in EXCLUDES.  Else return 0.
+  int regno_with_value (uint8_t val8, gprmask_t excludes) const
+  {
+    for (int r = REG_31; r >= FIRST_GPR; --r)
+      if (value (r) == val8
+         && ! (regmask (r, 1) & excludes))
+       return r;
+    return 0;
+  }
+
+  // Return a regno for a 16-bit reg that contains value HI8:LO8 and that does
+  // not overlap with the registers mentioned in EXCLUDES.  Else return 0.
+  int reg16_with_value (uint8_t lo8, uint8_t hi8, gprmask_t excludes) const
+  {
+    for (int r = REG_30; r >= FIRST_GPR; r -= 2)
+      if (! (regmask (r, 2) & excludes)
+         && value (r) == lo8
+         && value (r + 1) == hi8)
+       return r;
+    return 0;
+  }
+
+  void operator&= (const HARD_REG_SET &hrs)
+  {
+    known &= elt0_getter<HARD_REG_SET, HARD_REG_ELT_TYPE>::get (hrs);
+  }
+
+  // Coalesce register knowledge about *this and THAT.
+  void coalesce (const memento_t &that)
+  {
+    known &= that.known;
+
+    for (int i = FIRST_GPR; i < REG_32; ++i)
+      if (values[i] != that.values[i])
+       set_unknown (i);
+  }
+
+  void dump (const char *msg = nullptr, FILE *f = dump_file) const
+  {
+    if (f)
+      {
+       msg = msg && msg[0] ? msg : "%s\n";
+       const char *const xs = strstr (msg, "%s");
+       gcc_assert (xs);
+
+       fprintf (f, "%.*s", (int) (xs - msg), msg);
+       fprintf (f, " (%d known): ", n_known ());
+       for (int i = FIRST_GPR; i < REG_32; ++i)
+         if (knows (i))
+           fprintf (f, " r%d=%02x", i, values[i]);
+
+       fprintf (f, "%s", xs + strlen ("%s"));
+      }
+  }
+}; // memento_t
+
+
+// In avr-fuse-move, a possible step towards an optimal code sequence
+// to load a compile-time constant.  A ply_t represents one or two
+// instructions.  There are cases where there is no 1-to-1 correspondence
+// between a ply_t and an insn; but a sequence of ply_ts can be mapped to
+// a sequence of insns; though there are cases where 2 or more ply_ts map
+// to a single insn and vice versa.
+
+struct ply_t
+{
+  // The destination register with .size in { 1, 2 }.
+  int regno;
+  int size;
+
+  // The performed operation where .arg represents an optional source operand.
+  // .code may be one of:  SET (ldi, clr, ldi+mov), REG (mov, movw), NEG (neg),
+  // NOT (com), PRE_INC (inc), PRE_DEC (dec), ROTATE (swap), ASHIFT (lsl),
+  // LSHIFTRT (lsr), ASHIFTRT (asr), PLUS (add), MINUS (sub), AND (and),
+  // IOR (or), XOR (eor), SS_PLUS (adiw, sbiw), MOD (set+bld, clt+bld, bld).
+  rtx_code code;
+  int arg;
+
+  // Code size in terms of words / instructions.  Extra costs for, say
+  // a CLT prior to a sequence of BLDs, are added to the 1st element.
+  int cost;
+
+  // We only consider ply_ts that reduce the Hamming distance by 0, 1 or 2.
+  // There are exotic cases where the Hamming distance temporarily increases,
+  // but we don't consider them.  (They may fall out of the algorithm anyways,
+  // for example when a "set_some" insn is used that restores its scratch.)
+  int dhamming = 1;
+
+  // Whether this is a SET that's intended for insn "set_some"'s payload.
+  bool in_set_some = false;
+
+  // 0 or an upper scratch register.  One needed for SETs of a lower reg.
+  // SETs in a set_some don't need a scratch.
+  int scratch = 0;
+
+  // Statistics.
+  static int n_ply_ts;
+  static int max_n_ply_ts;
+
+  gprmask_t mask_dest () const
+  {
+    return regmask (regno, size);
+  }
+
+  gprmask_t mask_src () const
+  {
+    if (code == SET)
+      return 0;
+    else if (code == REG)
+      return regmask (arg, size);
+    else if (code == PLUS || code == MINUS || code == AND
+            || code == IOR || code == XOR)
+      return regmask (arg, size) | mask_dest ();
+    else
+      return mask_dest ();
+  }
+
+  bool is_movw () const
+  {
+    return size == 2 && code == REG;
+  }
+
+  bool is_adiw () const
+  {
+    return size == 2 && code == SS_PLUS;
+  }
+
+  bool is_bld () const
+  {
+    return code == MOD;
+  }
+
+  // A BLD setting one bit:  .arg has exactly one bit set.
+  bool is_setbld () const
+  {
+    return is_bld () && popcount_hwi (arg) == 1;
+  }
+
+  // A BLD clearing one bit:  .arg has exactly seven bits set.
+  bool is_cltbld () const
+  {
+    return is_bld () && popcount_hwi (arg) == 7;
+  }
+
+  rtx_code bld_rtx_code () const
+  {
+    return select<rtx_code>()
+      : is_setbld () ? IOR
+      : is_cltbld () ? AND
+      : UNKNOWN;
+  }
+
+  // Is *P a BLD of the same kind?
+  bool is_same_bld (const ply_t *p) const
+  {
+    gcc_assert (is_bld ());
+    return p && bld_rtx_code () == p->bld_rtx_code ();
+  }
+
+  int bld_bitno () const
+  {
+    gcc_assert (is_bld ());
+    int bit = exact_log2 (popcount_hwi (arg) == 1 ? arg : 0xff ^ arg);
+    gcc_assert (IN_RANGE (bit, 0, 7));
+
+    return bit;
+  }
+
+  bool needs_scratch () const
+  {
+    return code == SET && AVRasm::ldi_needs_scratch (regno, arg);
+  }
+
+  // Return true when *this modifies (changes *AND* uses) the result
+  // generated by *P.
+  bool changes_result_of (const ply_t *p) const
+  {
+    return code != REG && code != SET && (mask_dest() & p->mask_dest());
+  }
+
+  bool overrides (const ply_t *p) const
+  {
+    return code == REG || code == SET
+      ? mask_dest () & p->mask_dest ()
+      : false;
+  }
+
+  bool commutes_with (const ply_t *p, int scratch = 0) const
+  {
+    if (code == SET || p->code == SET)
+      {
+       // SETs will be emitted as a group where they commute.
+       if (code == SET && p->code == SET)
+         return true;
+
+       // Grant more flexibility to move around expensive SETs.
+       if (! scratch
+           && (needs_scratch () || p->needs_scratch ()))
+         return false;
+      }
+
+    if (is_bld () || p->is_bld ())
+      {
+       // BLD requires a previous SET or CLT which means that like
+       // BLDs should occur as a contiguous sequence.  This limits
+       // re-ordering for the purpose of canonicalization of instruction
+       // ordering.
+       return ((is_cltbld () && p->is_cltbld ())
+               || (is_setbld () && p->is_setbld ()));
+      }
+
+    gprmask_t msrc = 1u << scratch;
+    gprmask_t m1 = mask_dest() | mask_src();
+    gprmask_t m2 = p->mask_dest() | p->mask_src();
+    return (m1 & m2) == 0 && ((m1 | m2) & msrc) == 0;
+  }
+
+  // Expected insn name; used in dumps.  "???" for anything but a SET.
+  const char *insn_name () const
+  {
+    if (code == SET)
+      return select<const char *>()
+       : in_set_some ? "set_some"
+       : scratch && needs_scratch () ? "*reload_inqi"
+       : "movqi_insn";
+
+    return "???";
+  }
+
+  void dump (int level = 0, FILE *f = dump_file) const
+  {
+    if (f)
+      {
+       if (level)
+         avr_fdump (f, ";; .%d ply_t R%d[%d] = %C", level, regno, size, code);
+       else
+         avr_fdump (f, ";; ply_t R%d[%d] = %C", regno, size, code);
+       if (code == REG || is_adiw ())
+         fprintf (f, " %d", arg);
+       else if (code == PLUS || code == MINUS || code == AND
+                || code == IOR || code == XOR)
+         fprintf (f, " R%d", arg);
+       else if (is_setbld ())
+         fprintf (f, " BLD |= 0x%02x", arg);
+       else if (is_cltbld ())
+         fprintf (f, " BLD &= 0x%02x", arg);
+       else
+         fprintf (f, " 0x%x = %d", arg, arg);
+
+       const char *const name = insn_name ();
+       fprintf (f, ", cost=%d, dhamm=%d", cost, dhamming);
+       if (name && name[0] != '?')
+         fprintf (f, ", \"%s\"", name);
+       fprintf (f, "\n");
+      }
+  }
+
+  // Helper for dump_plys:  Value of the destination.
+  int dest_value (const memento_t &memo) const
+  {
+    return memo.value (regno, size);
+  }
+
+  // Helper for dump_plys:  Value of 1st source arg provided it is a register.
+  int src1_value (const memento_t &memo) const
+  {
+    int rsrc = regno;
+
+    switch (code)
+      {
+      default:
+       return -1;
+
+      case REG:
+       gcc_assert (size == 1 || size == 2);
+       rsrc = arg;
+       break;
+
+      case SS_PLUS:
+       gcc_assert (size == 2);
+       break;
+
+      case NEG: case NOT: case PRE_DEC: case PRE_INC:
+      case ASHIFT: case LSHIFTRT: case ASHIFTRT: case ROTATE:
+      case AND: case IOR: case XOR: case MOD:
+      case PLUS: case MINUS:
+       gcc_assert (size == 1);
+       break;
+      }
+
+    return memo.value (rsrc, size);
+  }
+
+  // Helper for dump_plys:  Value of 2nd source argument, or -1 if the
+  // operation has none.
+  int src2_value (const memento_t &memo) const
+  {
+    switch (code)
+      {
+      default:
+       break;
+
+      case AND: case IOR: case XOR:
+      case PLUS: case MINUS:
+       gcc_assert (size == 1);
+
+       return memo.value (arg, 1);
+      }
+
+    return -1;
+  }
+
+  // Dump the LEN plies in PS[].  This is tedious because the register state
+  // (starting at M0) has to be tracked to display each ply's specific action.
+  static void dump_plys (FILE *f, int level, int len,
+                        const ply_t *const ps[], const memento_t &m0)
+  {
+    if (f)
+      {
+       memento_t memo = m0;
+
+       for (int i = 0; i < len; ++i)
+         ps[i]->dump (level, memo, f);
+      }
+  }
+
+  void dump (int level, memento_t &memo, FILE *f = dump_file) const
+  {
+    if (! f)
+      return;
+
+    const ply_t &p = *this;
+
+    // Keep track of chars in the current line for neat alignment.
+    int cs = level > 0
+      ? fprintf (f, ";; .%d ", level)
+      : fprintf (f, ";; ");
+    cs += fprintf (f, "ply_t %-4s R%d[%d] = ", p.mnemonic (), p.regno, p.size);
+
+    const int x = p.src1_value (memo);
+    const int y = p.src2_value (memo);
+
+    memo.apply (p);  // From here on, MEMO is the state after this ply.
+
+    const int z = p.dest_value (memo);
+
+    switch (p.code)
+      {
+      default:
+       fprintf (f, "%s ???", rtx_name[p.code]);
+       gcc_unreachable ();
+       break;
+
+      case REG:
+       cs += fprintf (f, "R%d = 0x%0*x", p.arg, 2 * p.size, x);
+       break;
+
+      case SET:
+       cs += fprintf (f, "0x%02x = %d, \"%s\"", p.arg, p.arg, insn_name ());
+       break;
+
+      case PRE_DEC: case PRE_INC:
+      case ASHIFT: case LSHIFTRT: case ASHIFTRT: case ROTATE:
+       cs += fprintf (f, "R%d %s = 0x%02x = 0x%02x %s",
+                      p.regno, p.op_str (), z, x, p.op_str ());
+       break;
+
+      case NEG: case NOT:
+       cs += fprintf (f, "%sR%d = 0x%02x = %s0x%02x",
+                      p.op_str (), p.regno, z, p.op_str (), x);
+       break;
+
+      case PLUS: case MINUS:
+      case AND: case IOR: case XOR:
+       cs += fprintf (f, "R%d %s R%d = 0x%02x = 0x%02x %s 0x%02x",
+                      p.regno, p.op_str (), p.arg, z, x, p.op_str (), y);
+       break;
+
+      case SS_PLUS: // ADIW / SBIW
+       {
+         int arg = (int16_t) p.arg;
+         char op = arg < 0 ? '-' : '+';
+         cs += fprintf (f, "R%d %c %d = 0x%04x = 0x%04x %c %d", p.regno,
+                        op, std::abs (arg), z, x, op, std::abs (arg));
+       }
+       break;
+
+      case MOD: // BLD
+       {
+         const char opc = "&|" [p.is_setbld ()];
+         cs += fprintf (f, "R%d %c 0x%02x = 0x%02x = 0x%02x %c bit%d",
+                        p.regno, opc, p.arg, z, x, opc, p.bld_bitno ());
+       }
+       break;
+      }
+
+    cs += fprintf (f, ", ");
+
+    while (cs++ < 56)
+      fputc (' ', f);
+
+    fprintf (f, "cost=%d, dhamm=%d\n", p.cost, p.dhamming);
+  }
+
+  // AVR mnemonic; used in dumps.
+  const char *mnemonic () const
+  {
+    if (is_bld ())
+      {
+       static char s_bld[] = "BLD*";  // Shared; overwritten by the next call.
+       s_bld[3] = '0' + bld_bitno ();
+       return s_bld;
+      }
+
+    return select<const char *>()
+      : code == LSHIFTRT ? "LSR"
+      : code == ASHIFTRT ? "ASR"
+      : code == ASHIFT ? "LSL"
+      : code == ROTATE ? "SWAP"
+      : code == PRE_DEC ? "DEC"
+      : code == PRE_INC ? "INC"
+      : code == MINUS ? "SUB"
+      : code == PLUS ? "ADD"
+      : code == NEG ? "NEG"
+      : code == NOT ? "COM"
+      : code == AND ? "AND"
+      : code == IOR ? "OR"
+      : code == XOR ? "EOR"
+      : code == REG ? size == 1 ? "MOV" : "MOVW"
+      : code == SET ? arg == 0 ? "CLR" : "LDI"
+      : code == SS_PLUS ? arg < 0 ? "SBIW" : "ADIW"
+      : rtx_name[code];
+  }
+
+  // Return a string of length 1 for CODE, or "?".
+  static const char *code_name_str1 (rtx_code code)
+  {
+    return select<const char *>()
+      : code == NEG ? "-"
+      : code == NOT ? "~"
+      : code == AND ? "&"
+      : code == IOR ? "|"
+      : code == XOR ? "^"
+      : code == PLUS ? "+"
+      : code == MINUS ? "-"
+      : "?";
+  }
+
+  // Short semantics representation used in dumps.
+  const char *op_str () const
+  {
+    return select<const char *>()
+      : code == LSHIFTRT ? ">> 1"
+      : code == ASHIFTRT ? ">> 1"
+      : code == ASHIFT ? "<< 1"
+      : code == ROTATE ? ">>> 4"
+      : code == PRE_DEC ? "- 1"
+      : code == PRE_INC ? "+ 1"
+      : code == SS_PLUS ? "+"
+      : *(ply_t::code_name_str1 (code)) != '?' ? ply_t::code_name_str1 (code)
+      : rtx_name[code];
+  }
+}; // ply_t
+
+
+// A set of ply_t's.  We prefer std:array (with some expected upper
+// bound for the number of ply_t's as generated by bbinfo_t::get_plies())
+// over std::vector.  That way, all plies_t are only allocated once as
+// elements of avr_pass_fuse_move::BInfo.
+
+struct plies_t
+{
+  int n_plies;
+  std::array<ply_t, 50> plies;
+
+  int emit_insns (const insninfo_t &, const memento_t &) const;
+  int emit_sets (const insninfo_t&, int &n_insns, const memento_t&, int) const;
+  int emit_blds (const insninfo_t &, int &n_insns, int i0) const;
+  void add_plies_movw (int regno, int size, uint64_t, int, const memento_t &);
+
+  void reset ()
+  {
+    n_plies = 0;
+  }
+
+  void add (const ply_t &ply)
+  {
+    if (n_plies < (int) plies.size ())
+      {
+       plies[n_plies++] = ply;
+       ply_t::n_ply_ts += 1;
+      }
+    else
+      avr_dump (";; WARNING: plies_t is full\n");
+  }
+
+  void add (ply_t, const ply_t *prev, const memento_t &, bool maybe_set_some);
+
+  plies_t () {}
+
+  plies_t (int n, const ply_t *const ps[])
+  {
+    gcc_assert (n <= (int) plies.size ());
+    for (int i = 0; i < n; ++i)
+      plies[i] = *ps[i];
+    n_plies = n;
+  }
+
+  static int max_n_plies;
+}; // plies_t
+
+
+// An 8-bit value leaf of absint_byte_t.
+// May be known to equal an 8-bit value.
+// May be known to equal the content of an 8-bit GPR.
+struct absint_val_t
+{
+  int16_t val8 = -1;
+  int8_t regno = 0;
+
+  absint_val_t () {}
+
+  bool knows_val8 () const
+  {
+    gcc_assert (IN_RANGE (val8, -1, 0xff));
+    return val8 >= 0;
+  }
+
+  bool knows_regno () const
+  {
+    gcc_assert (IN_RANGE (regno, 0, REG_31));
+    return regno;
+  }
+
+  bool clueless () const
+  {
+    return ! knows_val8 () && ! knows_regno ();
+  }
+
+  gprmask_t reg_mask () const
+  {
+    return regno ? regmask (regno, 1) : 0;
+  }
+
+  void dump (FILE *f = dump_file) const
+  {
+    if (f)
+      {
+       if (knows_regno ())
+         fprintf (f, "r%d%s", regno, knows_val8 () ? "=" : "");
+       if (knows_val8 ())
+         fprintf (f, "%02x", val8);
+       else if (! knows_regno ())
+         fprintf (f, "--");
+      }
+  }
+}; // absint_val_t
+
+
+// One byte in AbsInt.
+class absint_byte_t
+{
+  // "SET": the value is .x0.
+  rtx_code code = UNKNOWN;
+  absint_val_t x0;
+  absint_val_t x1;
+
+public:
+
+  const absint_val_t &arg (int i) const
+  {
+    gcc_assert (IN_RANGE (i, 0, arity () - 1));
+    return i == 1 ? x1 : x0;
+  }
+
+  rtx_code get_code () const
+  {
+    return code;
+  }
+
+  absint_byte_t () {}
+
+  absint_byte_t (absint_val_t x)
+    : code(x.clueless () ? UNKNOWN : SET), x0(x)
+  {}
+
+  // new = <code> A0  where CODE is a unary operation.
+  absint_byte_t (rtx_code c, const absint_byte_t &a0)
+    : code(c)
+  {
+    switch (code)
+      {
+      default:
+       gcc_unreachable ();
+
+      case NOT:
+       if (a0.can (CONST_INT))
+         init_val8 (absint_byte_t::eval (code, a0.val8 ()));
+       else if (a0.can (REG))
+         x0 = a0.x0;
+       else if (a0.can (NOT))
+         init_regno (a0.regno ());
+       else
+         code = UNKNOWN;
+       break;
+
+      case SIGN_EXTEND:
+       if (a0.can (CONST_INT))
+         init_val8 (absint_byte_t::eval (code, a0.val8 ()));
+       else if (a0.can (REG))
+         x0 = a0.x0;
+       else
+         code = UNKNOWN;
+       break;
+      }
+  }
+
+  // new = A0 <code> A1  where CODE is a binary operation.
+  absint_byte_t (rtx_code c, const absint_byte_t &a0, const absint_byte_t &a1)
+    : code(c)
+  {
+    gcc_assert (c == AND || c == IOR || c == XOR || code == PLUS);
+
+    if (a1.is_image1 (c))
+      *this = a1;
+    else if (a0.is_image1 (c))
+      *this = a0;
+    else if (a1.is_neutral (c))
+      *this = a0;
+    else if (a0.is_neutral (c))
+      *this = a1;
+    else if (a0.can (CONST_INT) && a1.can (CONST_INT))
+      init_val8 (absint_byte_t::eval (code, a0.val8 (), a1.val8 ()));
+    else if (a0.can (REG) && a1.can (CONST_INT))
+      {
+       x0 = a0.x0;
+       x1 = a1.x0;
+       if (code == XOR && a1.val8 () == 0xff)
+         code = NOT;
+      }
+    else if (a0.can (CONST_INT) && a1.can (REG))
+      {
+       x0 = a1.x0;
+       x1 = a0.x0;
+       if (code == XOR && a0.val8 () == 0xff)
+         code = NOT;
+      }
+    else if (a0.can (REG) && a1.can (REG))
+      {
+       x0.regno = std::min (a0.regno (), a1.regno ());
+       x1.regno = std::max (a0.regno (), a1.regno ());
+      }
+    else
+      code = UNKNOWN;
+  }
+
+  int arity () const
+  {
+    return select<int>()
+      : code == UNKNOWN ? 0
+      : code == SET || code == NOT || code == SIGN_EXTEND ? 1
+      : code == AND || code == IOR || code == XOR || code == PLUS ? 2
+      : bad_case<int> ();
+  }
+
+  // Return a byte with 8 signs according to code CODE.
+  absint_byte_t get_signs (rtx_code ext) const
+  {
+    return select<absint_byte_t>()
+      : ext == ZERO_EXTEND ? absint_byte_t::from_val8 (0)
+      : ext == SIGN_EXTEND ? absint_byte_t (SIGN_EXTEND, *this)
+      : ext == LSHIFTRT ? absint_byte_t::from_val8 (0)
+      : ext == ASHIFTRT ? absint_byte_t (SIGN_EXTEND, *this)
+      : bad_case<absint_byte_t> ();
+  }
+
+  gprmask_t reg_mask () const
+  {
+    return select<gprmask_t>()
+      : code == SET ? x0.reg_mask ()
+      : arity () == 1 ? x0.reg_mask ()
+      : arity () == 2 ? x0.reg_mask () | x1.reg_mask ()
+      : bad_case<gprmask_t> ();
+  }
+
+  bool check () const
+  {
+    return select<bool>()
+      : arity () >= 1 && x0.clueless () ? false
+      : arity () == 2 && x1.clueless () ? false
+      : true;
+  }
+
+  static inline uint8_t eval (rtx_code code, uint8_t x)
+  {
+    return select<int>()
+      : code == NOT ? ~x
+      : code == SIGN_EXTEND ? (x >= 0x80 ? 0xff : 0x00)
+      : bad_case<int> ();
+  }
+
+  static inline uint8_t eval (rtx_code code, uint8_t x, uint8_t y)
+  {
+    return select<int>()
+      : code == AND ? x & y
+      : code == IOR ? x | y
+      : code == XOR ? x ^ y
+      : code == PLUS ? x + y
+      : bad_case<int> ();
+  }
+
+  bool is_neutral (rtx_code c) const
+  {
+    return can (CONST_INT) && val8 () == AVRasm::neutral_val (c);
+  }
+
+  bool is_image1 (rtx_code c) const
+  {
+    return can (CONST_INT) && val8 () == AVRasm::image1_val (c);
+  }
+
+  bool can (rtx_code c) const
+  {
+    if (code == SET)
+      gcc_assert (IN_RANGE (x0.val8, 0, 0xff) || gpr_regno_p (x0.regno));
+
+    if (c == CONST_INT)
+      return code == SET && x0.knows_val8 ();
+    else if (c == REG)
+      return code == SET && x0.knows_regno ();
+    else if (c == VALUE)
+      return code != UNKNOWN;
+    else if (c == UNKNOWN
+            || c == SET || c == NOT || c == SIGN_EXTEND
+            || c == AND || c == IOR || c == XOR || c == PLUS)
+      return code == c;
+
+    gcc_unreachable ();
+  }
+
+  // The known byte value in 0...0xff.  When the value is not known, then
+  // STRICT triggers an assertion, and ! STRICT returns -1.
+  int val8 (bool strict = true) const
+  {
+    if (strict)
+      {
+        gcc_assert (code == SET);
+        gcc_assert (can (CONST_INT));
+      }
+
+    if (! can (CONST_INT))
+      return -1;
+
+    return x0.val8;
+  }
+
+  // The known register number.  When the register is not known, then
+  // STRICT triggers an assertion, and ! STRICT returns 0.
+  int regno (bool strict = true) const
+  {
+    if (strict)
+      {
+        gcc_assert (code == SET);
+        gcc_assert (can (REG));
+      }
+
+    if (! can (REG))
+      return 0;
+
+    return x0.regno;
+  }
+
+  // Turn this byte into a SET of the constant V in 0...0xff.
+  // x0.regno is reset to 0.
+  void init_val8 (int v)
+  {
+    gcc_assert (IN_RANGE (v, 0, 0xff));
+
+    code = SET;
+    x0.regno = 0;
+    x0.val8 = v;
+  }
+
+  // Turn this byte into a SET of GPR number R.
+  // x0.val8 is reset to -1.
+  void init_regno (int r)
+  {
+    gcc_assert (gpr_regno_p (r));
+
+    code = SET;
+    x0.regno = r;
+    x0.val8 = -1;
+  }
+
+  // Record that this byte has the value V in 0...0xff.  Unlike with
+  // init_val8, x0.regno is left alone.  Only valid for bytes whose
+  // code is SET or UNKNOWN.
+  void learn_val8 (int v)
+  {
+    gcc_assert (IN_RANGE (v, 0, 0xff));
+    gcc_assert (code == SET || code == UNKNOWN);
+
+    code = SET;
+    x0.val8 = v;
+  }
+
+  // Record that this byte equals the content of GPR number R.  Unlike
+  // with init_regno, x0.val8 is left alone.  Only valid for bytes whose
+  // code is SET or UNKNOWN.
+  void learn_regno (int r)
+  {
+    gcc_assert (gpr_regno_p (r));
+    gcc_assert (code == SET || code == UNKNOWN);
+
+    code = SET;
+    x0.regno = r;
+  }
+
+  // Build a byte from VAL in 0...0xff.  When ! STRICT, then VAL may also
+  // be -1, in which case a default-constructed byte is returned.
+  static inline absint_byte_t from_val8 (int val, bool strict = true)
+  {
+    gcc_assert (IN_RANGE (val, -1, 0xff));
+    gcc_assert (! strict || val >= 0);
+
+    absint_byte_t result;
+
+    if (val < 0)
+      return result;
+
+    result.init_val8 (val);
+    return result;
+  }
+
+  // For a binary operation with a known register x0 and an operand x1 of
+  // known value:  Ask MEMO for a register that holds x1's value, where
+  // x1's own register (if known) is excluded from the search.
+  // On success, return a copy of this byte where x1.regno is replaced by
+  // the register found.  Otherwise, return a default-constructed
+  // absint_byte_t.  (The result is an absint_byte_t, not an rtx; use
+  // to_rtx() in order to get an rtx.)
+  absint_byte_t find_alternative_binary (const memento_t &memo) const
+  {
+    gprmask_t excludes = x1.knows_regno () ? regmask (x1.regno, 1) : 0;
+    absint_byte_t alt = *this;
+
+    // The last condition also stores the register found in ALT; a
+    // result of 0 makes the condition fail.
+    if (arity () == 2
+       && x0.knows_regno ()
+       && x1.knows_val8 ()
+       && (! x1.knows_regno () || x0.regno != x1.regno)
+       && (alt.x1.regno = memo.regno_with_value (x1.val8, excludes)))
+      {
+       if (dump_flags & TDF_FOLDING)
+         {
+           alt.dump (";; AI.alternative AI=[%s]");
+           dump (" can replace AI=[%s]\n");
+         }
+
+       return alt;
+      }
+
+    return absint_byte_t {};
+  }
+
+  rtx to_rtx () const
+  {
+    if (arity () == 2)
+      {
+       gcc_assert (x0.knows_regno ());
+       gcc_assert (x1.knows_regno ());
+       rtx op0 = gen_rtx_REG (QImode, x0.regno);
+       rtx op1 = gen_rtx_REG (QImode, x1.regno);
+       return gen_rtx_fmt_ee (code, QImode, op0, op1);
+      }
+
+    gcc_unreachable ();
+  }
+
+  void dump (const char *msg = nullptr, FILE *f = dump_file) const
+  {
+    if (f)
+      {
+       msg = msg && msg[0] ? msg : "%s";
+       const char *const xs = strstr (msg, "%s");
+       gcc_assert (xs);
+
+       fprintf (f, "%.*s", (int) (xs - msg), msg);
+       if (code == UNKNOWN)
+         fprintf (f, "--");
+       else if (code == SET)
+         x0.dump (f);
+       else if (code == NOT)
+         {
+           fprintf (f, "~");
+           x0.dump (f);
+         }
+       else if (code == SIGN_EXTEND)
+         {
+           fprintf (f, "signs(");
+           x0.dump (f);
+           fprintf (f, ")");
+         }
+       else if (arity () == 2)
+         {
+           x0.dump (f);
+           fprintf (f, "%s", ply_t::code_name_str1 (code));
+           x1.dump (f);
+         }
+       else
+         gcc_unreachable ();
+
+       fprintf (f, "%s", xs + strlen ("%s"));
+      }
+  }
+}; // absint_byte_t
+
+
+// A basic block together with the information that the pass needs
+// for it, like the known GPR contents.  The static members are shared
+// between all basic blocks.
+struct bbinfo_t
+{
+  // All BBs of the current function.
+  static bbinfo_t *bb_info;
+
+  // bbinfo_t holds additional information for this basic block.
+  basic_block bb;
+
+  // Known values held in GPRs.
+  memento_t regs;
+
+  // Represents the "time" when the value was set.  When we have the choice
+  // between several registers to copy from, we use the first (oldest) set.
+  // This can avoid copy-chains.
+  std::array<int, REG_32> ticks;
+  static int tick;
+
+  // Whether the according BB is done and optimized.
+  bool done;
+
+  static void optimize_one_function (function *func);
+  void optimize_one_block (bool &changed);
+  void enter ();
+  void leave ();
+
+  // Used when finding a best plies_t.  This object is only needed
+  // once and can be shared between all basic blocks.
+  struct find_plies_data_t
+  {
+    // These are used by [run_]find_plies()
+    const ply_t *ply_stack[N_BEST_PLYS];
+    plies_t plies[N_BEST_PLYS];
+    plies_t solution;
+    // Register knowledge at start of recursive algo.
+    memento_t regs0;
+    int max_ply_cost;
+    int movmode_cost;
+    int n_best_plys;
+    int n_get_plies; // Only for bookkeeping / statistics.
+  }; // find_plies_data_t
+
+  static find_plies_data_t *fpd;
+
+  // Which optimizations should be performed.
+  static bool try_fuse_p;
+  static bool try_bin_arg1_p;
+  static bool try_simplify_p;
+  static bool try_split_ldi_p;
+  static bool try_split_any_p;
+  static bool use_arith_p;
+  static bool use_set_some_p;
+
+  static void get_plies (plies_t &, const insninfo_t &, const memento_t &,
+                        const ply_t *);
+  static void find_plies (int depth, const insninfo_t &, const memento_t &);
+  bool run_find_plies (const insninfo_t &, const memento_t &) const;
+}; // bbinfo_t
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc

index dff98368caa1708c3055ad58c4b98f3fdafdb960..2b67f9fa32ccdcc8774e7c88c8c4f9f8a287085b 100644 (file)
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -19,6 +19,7 @@
  
  #define IN_TARGET_CODE 1
  
+#define INCLUDE_ARRAY
  #define INCLUDE_VECTOR
  #include "config.h"
  #include "system.h"
@@ -41,6 +42,3183 @@
  #include "cfgrtl.h"
  #include "context.h"
  #include "tree-pass.h"
+#include "insn-attr.h"
+
+
+#define CONST_INT_OR_FIXED_P(X) (CONST_INT_P (X) || CONST_FIXED_P (X))
+
+#define FIRST_GPR (AVR_TINY ? REG_18 : REG_2)
+
+namespace
+{
+
+/////////////////////////////////////////////////////////////////////////////
+// Before we start with the very code, introduce some helpers that are
+// quite generic, though up to now only avr-fuse-add makes use of them.
+
+/* Starting at INSN, walk forward (FORWARD = true) or backward
+   (FORWARD = false) to the nearest insn that satisfies NONDEBUG_INSN_P.
+   Return that insn provided it belongs to basic block BB, and nullptr
+   otherwise or when there is no such insn.  This assumes we are in CFG
+   layout mode so that BLOCK_FOR_INSN() can be used.  */
+
+static rtx_insn *
+next_nondebug_insn_bb (basic_block bb, rtx_insn *insn, bool forward = true)
+{
+  for (rtx_insn *it = insn; it != nullptr; )
+    {
+      it = forward ? NEXT_INSN (it) : PREV_INSN (it);
+
+      if (it == nullptr)
+        break;
+
+      if (NONDEBUG_INSN_P (it))
+        return BLOCK_FOR_INSN (it) == bb ? it : nullptr;
+    }
+
+  return nullptr;
+}
+
+/* Like next_nondebug_insn_bb, but walk backward from INSN.  */
+
+static rtx_insn *
+prev_nondebug_insn_bb (basic_block bb, rtx_insn *insn)
+{
+  const bool forward = false;
+
+  return next_nondebug_insn_bb (bb, insn, forward);
+}
+
+
+/* Like `single_set' with the addition that it sets REGNO_SCRATCH when the
+   insn is a single_set with a QImode scratch register.  When the insn has
+   no QImode scratch or just a scratch:QI, then set REGNO_SCRATCH = 0.
+   The assumption is that the function is only used after the splits for
+   REG_CC so that the pattern is a parallel with 2 elements (INSN has no
+   scratch operand), or 3 elements (INSN does have a scratch operand).  */
+
+static rtx
+single_set_with_scratch (rtx_insn *insn, int &regno_scratch)
+{
+  regno_scratch = 0;
+
+  if (! INSN_P (insn))
+    return NULL_RTX;
+
+  rtx set, clo, reg, pat = PATTERN (insn);
+
+  // Search for SET + CLOBBER(QI) + CLOBBER(CC).
+  if (GET_CODE (pat) == PARALLEL
+      && XVECLEN (pat, 0) == 3
+      && GET_CODE (set = XVECEXP (pat, 0, 0)) == SET
+      // At this pass, all insn are endowed with clobber(CC).
+      && GET_CODE (clo = XVECEXP (pat, 0, 2)) == CLOBBER
+      && GET_MODE (XEXP (clo, 0)) == CCmode
+      && GET_CODE (clo = XVECEXP (pat, 0, 1)) == CLOBBER
+      && REG_P (reg = XEXP (clo, 0))
+      && GET_MODE (reg) == QImode)
+    {
+      regno_scratch = REGNO (reg);
+      return set;
+    }
+
+  return single_set (insn);
+}
+
+// Emit pattern PAT, and ICE when the insn is not valid / not recognized.
+
+static rtx_insn *
+emit_valid_insn (rtx pat)
+{
+  rtx_insn *insn = emit_insn (pat);
+
+  if (! valid_insn_p (insn))  // Also runs recog().
+    fatal_insn ("emit unrecognizable insn", insn);
+
+  return insn;
+}
+
+// Emit the move  DEST = SRC  by means of gen_gen_move_clobbercc, or by
+// means of gen_gen_move_clobbercc_scratch when SCRATCH is supplied.
+// The new insn is asserted to be valid and recognized.
+
+static rtx_insn *
+emit_valid_move_clobbercc (rtx dest, rtx src, rtx scratch = NULL_RTX)
+{
+  rtx pat;
+
+  if (scratch)
+    pat = gen_gen_move_clobbercc_scratch (dest, src, scratch);
+  else
+    pat = gen_gen_move_clobbercc (dest, src);
+
+  return emit_valid_insn (pat);
+}
+
+// One bit for each GPR in REG_0 ... REG_31.
+using gprmask_t = uint32_t;
+
+// True when REGNO is a valid GPR number for an object of N_BYTES bytes
+// in ordinary code, e.g. registers wider than 1 byte have to start at
+// an even regno.  TMP_REG and ZERO_REG are not considered valid, even
+// though the C source can use register vars with them.
+static inline bool
+gpr_regno_p (int regno, int n_bytes = 1)
+{
+  // The object must fit into FIRST_GPR ... R31.
+  if (! IN_RANGE (regno, FIRST_GPR, REG_32 - n_bytes))
+    return false;
+
+  // One bit for each supported size in { 1, 2, 3, 4, 8 } bytes.
+  const unsigned valid_sizes = 0x11e;
+
+  if (((1u << n_bytes) & valid_sizes) == 0)
+    return false;
+
+  // Registers >= 2 bytes start at an even regno.
+  return n_bytes == 1 || regno % 2 == 0;
+}
+
+// Mask with SIZE consecutive bits set, starting at bit REGNO.
+// There are cases where the C source defines local reg vars
+// for R1 etc.  The assumption is that this is handled before
+// calling this function, e.g. by skipping code when a register
+// overlaps with a fixed register.
+static inline gprmask_t
+regmask (int regno, int size)
+{
+  gcc_checking_assert (gpr_regno_p (regno, size));
+
+  const gprmask_t low_bits = (1u << size) - 1;
+  return low_bits << regno;
+}
+
+// Mask for hard register X that's some GPR, including fixed regs like R0:
+// One bit for each byte of X's mode, starting at bit REGNO (X).
+static gprmask_t
+regmask (rtx x)
+{
+  gcc_assert (REG_P (x));
+
+  const int n_bytes = GET_MODE_SIZE (GET_MODE (x));
+  const gprmask_t low_bits = (1u << n_bytes) - 1;
+
+  return low_bits << REGNO (x);
+}
+
+
+// Whether X has bits in the range [B0 ... B1].  Parts of the range that
+// lie outside of the 32 bits of gprmask_t contain no bits, hence the
+// result is false when the range is empty or entirely outside 0...31.
+static inline bool
+has_bits_in (gprmask_t x, int b0, int b1)
+{
+  if (b0 > b1 || b0 > 31 || b1 < 0)
+    return false;
+
+  // Clamp the range to the bits that exist in gprmask_t.  Otherwise,
+  // a negative B0 or a range wider than 32 bits would lead to shift
+  // counts that are negative or too large, which is undefined behavior.
+  const int lo = b0 < 0 ? 0 : b0;
+  const int hi = b1 > 31 ? 31 : b1;
+
+  // HI - LO is in 0...31, and for 31 the unsigned shift wraps to 0
+  // so that M has all bits set.
+  const gprmask_t m = (2u << (hi - lo)) - 1;
+  return x & (m << lo);
+}
+
+
+// Terminal case for the "select" chains below:  Never returns normally
+// since reaching it means that no condition of the chain matched.
+// The template parameter provides the type for the conditional expression.
+template<typename T>
+T bad_case ()
+{
+  gcc_unreachable ();
+}
+
+// Allows to write a chain of conditional expressions with uniform lines:
+//     select<T>()
+//       : cond1 ? value1
+//       : cond2 ? value2
+//       : fallback;
+// This expands to  false ? bad_case<T>() : cond1 ? value1 : ...  so that
+// the leading bad_case<T>() is never evaluated.
+#define select false ? bad_case
+
+
+namespace AVRasm
+{
+  // Returns true when a scratch reg is needed in order to get the
+  // (signed or unsigned) 8-bit value VAL into GPR number REGNO.
+  // When it's about costs rather than the sheer requirement for a
+  // scratch, see also AVRasm::constant_cost.
+  static inline bool ldi_needs_scratch (int regno, int val)
+  {
+    return regno < REG_16 && IN_RANGE (val & 0xff, 2, 254);
+  }
+
+  // Return a byte value x >= 0 such that  x <code> y == y  for all y,
+  // i.e. the neutral element of operation CODE.  Return -1 for codes
+  // other than AND, IOR, XOR and PLUS.
+  static inline int neutral_val (rtx_code code)
+  {
+    return select<int>()
+      : code == AND ? 0xff
+      : code == IOR ? 0x00
+      : code == XOR ? 0x00
+      : code == PLUS ? 0
+      : -1;
+  }
+
+  // When there exists a value x such that the image of the function
+  //   y -> y <code> x  has order 1, then return that x.  Else return -1.
+  // For example,  y & 0x00  is 0x00 for all y.
+  static inline int image1_val (rtx_code code)
+  {
+    return select<int>()
+      : code == AND ? 0x00
+      : code == IOR ? 0xff
+      : -1;
+  }
+
+  // Cost of 8-bit binary operation  x o= VAL  provided a scratch is
+  // available as needed:  0 when VAL is the neutral value of CODE,
+  // else 1 plus 1 when a scratch is needed.  CODE == SET stands for
+  // loading VAL into register number REGNO.
+  static int constant_cost (rtx_code code, int regno, uint8_t val)
+  {
+    bool needs_scratch_p = select<bool>()
+      : code == PLUS ? regno < REG_16 && val != 1 && val != 0xff
+      : code == XOR ? val != 0xff && (regno < REG_16 || val != 0x80)
+      : code == IOR ? regno < REG_16
+      : code == AND ? regno < REG_16 && val != 0
+      : code == SET ? regno < REG_16 && val != 0
+      : bad_case<bool> ();
+
+    return val == AVRasm::neutral_val (code)
+      ? 0
+      : 1 + needs_scratch_p;
+  }
+}; // AVRasm
+
+
+// Returns the mode mask for a mode size of SIZE bytes, i.e. a value
+// with the low 8 * SIZE bits set.
+static uint64_t size_to_mask (int size)
+{
+  const int top_bit = 8 * size - 1;
+
+  // Shifting 2 by the position of the top bit (rather than 1 by the
+  // bit size) keeps the shift count below 64 for SIZE = 8.
+  const uint64_t above_top = (uint64_t) 2 << top_bit;
+
+  return above_top - 1;
+}
+
+// Return the scalar int mode for a modesize of 1, 2, 3, 4 or 8 bytes.
+// Other sizes are not supported.
+static machine_mode size_to_mode (int size)
+{
+  switch (size)
+    {
+    case 1:
+      return QImode;
+
+    case 2:
+      return HImode;
+
+    case 3:
+      return PSImode;
+
+    case 4:
+      return SImode;
+
+    case 8:
+      return DImode;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+\f
+//////////////////////////////////////////////////////////////////////////////
+// Optimize moves after reload: -mfuse-move=<0,23>
+
+/* The purpose of this pass is to perform optimizations after reload
+   like the following ones:
+
+   Without optimization                     |   With optimization
+   ====================                     |   =================
+
+   long long fn_zero (void)        (1)
+   {
+      return 0;
+   }
+
+   ldi r18, 0    ;  movqi_insn      |   ldi r18, 0     ;  movqi_insn
+   ldi r19, 0    ;  movqi_insn      |   ldi r19, 0     ;  movqi_insn
+   ldi r20, 0    ;  movqi_insn      |   movw r20, r18  ;  *movhi
+   ldi r21, 0    ;  movqi_insn      |
+   ldi r22, 0    ;  movqi_insn      |   movw r22, r18  ;  *movhi
+   ldi r23, 0    ;  movqi_insn      |
+   ldi r24, 0    ;  movqi_insn      |   movw r24, r18  ;  *movhi
+   ldi r25, 0    ;  movqi_insn      |
+   ret                              |   ret
+
+   int fn_eq0 (char c)             (2)
+   {
+       return c == 0;
+   }
+
+   mov r18, r24           ;  movqi_insn     |   mov r18, r24   ;  movqi_insn
+   ldi r24, 1     ;  *movhi         |   ldi r24, 1     ;  *movhi
+   ldi r25, 0                       |   ldi r25, 0
+   cp  r18, ZERO   ;  cmpqi3        |   cpse r18, ZERO ;  peephole
+   breq .+4       ;  branch         |
+   ldi r24, 0     ;  *movhi         |   ldi r24, 0     ;  movqi_insn
+   ldi r25, 0                       |
+   ret                              |   ret
+
+   int a, b;                       (3)
+
+   void fn_store_ab (void)
+   {
+       a = 1;
+       b = -1;
+   }
+
+   ldi r24, 1     ;  *movhi         |  ldi r24, 1       ;  *movhi
+   ldi r25, 0                       |  ldi r25, 0
+   sts a+1, r25           ;  *movhi         |  sts a+1, r25     ;  *movhi
+   sts a,   r24                             |  sts a,   r24
+   ldi r24, -1    ;  *movhi         |  sbiw r24, 2      ;  *addhi3
+   ldi r25, -1                      |
+   sts b+1, r25           ;  *movhi         |  sts b+1, r25     ;  *movhi
+   sts b,   r24                             |  sts b,   r24
+   ret                              |  ret
+
+   unsigned fn_crc (unsigned x, unsigned y)   (4)
+   {
+       for (char i = 8; i--; x <<= 1)
+          y ^= (x ^ y) & 0x80 ? 79U : 0U;
+       return y;
+   }
+
+   movw r18, r24   ;  *movhi        |  movw r18, r24    ;  *movhi
+   movw r24, r22   ;  *movhi        |  movw r24, r22    ;  *movhi
+   ldi r22, 8     ;  movqi_insn     |  ldi  r22, 8      ;  movqi_insn
+  .L13:                                     | .L13:
+   movw r30, r18   ;  *movhi        |  movw r30, r18    ;  *movhi
+   eor r30, r24   ;  *xorqi3        |  eor  r30, r24    ;  *xorqi3
+   eor r31, r25   ;  *xorqi3        |  eor  r31, r25    ;  *xorqi3
+   mov r20, r30   ;  *andhi3        |  mov  r20, r30    ;  *andqi3
+   andi r20, 1<<7                   |  andi r20, 1<<7
+   clr r21                          |
+   sbrs r30, 7    ;  *sbrx_branchhi |  sbrc r30, 7      ;  *sbrx_branchhi
+   rjmp .+4                         |
+   ldi r20, 79    ;  movqi_insn     |  ldi  r20, 79     ;  movqi_insn
+   ldi r21, 0     ;  movqi_insn     |
+   eor r24, r20   ;  *xorqi3        |  eor r24, r20     ;  *xorqi3
+   eor r25, r21   ;  *xorqi3        |
+   lsl r18        ;  *ashlhi3_const |  lsl  r18         ;  *ashlhi3_const
+   rol r19                          |  rol  r19
+   subi r22, 1    ;  *op8.for.cczn.p|  subi r22, 1      ;  *op8.for.cczn.plus
+   brne .L13      ;  branch_ZN      |  brne .L13        ;  branch_ZN
+   ret                              |  ret
+
+   #define SPDR (*(uint8_t volatile*) 0x2c)     (5)
+
+   void fn_PR49807 (long big)
+   {
+       SPDR = big >> 24;
+       SPDR = big >> 16;
+       SPDR = big >> 8;
+       SPDR = big;
+   }
+
+   movw r20, r22   ;  *movhi        |  movw r20, r22    ;  *movhi
+   movw r22, r24   ;  *movhi        |  movw r22, r24    ;  *movhi
+   mov r24, r23   ;  *ashrsi3_const |
+   clr r27                          |
+   sbrc r24,7                       |
+   com r27                          |
+   mov r25, r27                     |
+   mov r26, r27                     |
+   out 0xc, r24   ;  movqi_insn     |  out 0xc, r23     ;  movqi_insn
+   movw r24, r22   ;  *ashrsi3_const |
+   clr r27                          |
+   sbrc r25, 7                      |
+   com r27                          |
+   mov r26, r27                     |
+   out 0xc, r24   ;  movqi_insn     |  out 0xc, r24     ;  movqi_insn
+   clr r27        ;  *ashrsi3_const |
+   sbrc r23, 7                      |
+   dec r27                          |
+   mov r26, r23                     |
+   mov r25, r22                     |
+   mov r24, r21                     |
+   out 0xc, r24   ;  movqi_insn     |  out 0xc, r21     ;  movqi_insn
+   out 0xc, r20   ;  movqi_insn     |  out 0xc, r20     ;  movqi_insn
+   ret                              |  ret
+
+   The insns of each basic block are traversed from first to last.
+   Each insn is optimized on its own, or may be fused with the
+   previous insn like in example (1).
+      As the insns are traversed, memento_t keeps track of known values
+   held in the GPRs (general purpose registers) R2 ... R31 by simulating
+   the effect of the current insn in memento_t.apply_insn().
+      The basic blocks are traversed in reverse post order so as to
+   maximize the chance that GPRs from all preceding blocks are known,
+   which is the case in example (2).  The traversal of the basic blocks
+   is performed by bbinfo_t.optimize_one_function().
+      bbinfo_t.optimize_one_block() traverses the insns of a BB and tries
+   the following optimizations:
+
+   bbinfo_t::try_fuse_p
+      Try to fuse two 8-bit insns to one MOVW like in (1).
+
+   bbinfo_t::try_simplify_p
+      Only perform the simplest optimizations that don't impede the
+      traceability of the generated code, which are:
+      - Transform operations like  Rn = Rn=0 ^ Rm  to  Rn = Rm.
+      - Remove insns that are no-ops like  Rn = Rn ^ Rm=0.
+
+   bbinfo_t::try_bin_arg1_p
+      In insns like  EOR Rn, arg1  where arg1 is known or is a reg that
+      dies in the insn, *and* there is a different register Rm that's
+      known to contain the same value, then arg1 is replaced with Rm.
+
+   bbinfo_t::try_split_ldi_p
+      Tries to simplify loads of constants like in examples (1), (2) and (3).
+      It may use arithmetic instructions like AND with registers that
+      are holding known values when this is profitable.
+
+   bbinfo_t::try_split_any_p
+      Split all insns where the operation can be performed on individual
+      bytes, like andsi3.  In example (4) the andhi3 can be optimized
+      to an andqi3.
+*/
+
+
+// A basic block with additional information like the GPR state.
+// The main entry point for the pass.  Runs various strategies
+// like try_fuse, try_simplify, try_bin_arg1, try_split_ldi, try_split_any
+// depending on -mfuse-move=<0,23>.
+struct bbinfo_t;
+
+// Additional insn information on a  REG = non-memory  single_set insn
+// for quick access.  Only valid when the m_size member is non-zero.
+struct insninfo_t;
+
+// Helper classes with data needed by the try_xxx optimizers.
+struct optimize_data_t;
+struct insn_optimize_data_t;
+
+// Records which GPRs R0 ... R31 are holding a known value,
+// and which values these are.
+struct memento_t;
+
+// Abstract Interpretation of expressions.
+// absint_val_t represents an 8-bit value that equals the content of
+//    some GPR, or equals some known value (or both, or none of them).
+// absint_byte_t represents an 8-bit entity that is equivalent to
+//    an absint_val_t, or is equivalent to some (unary or binary) operation
+//    on absint_val_t's like NOT, AND, IOR, XOR that operate bit-wise (and
+//    hence also byte-wise).
+// absint_t represents an array of absint_byte_t's.  When some insn is applied
+//    to a GPR state, then memento_t.apply_insn() represents the RHS of
+//    a single_set as an absint_t, and then applies that result to the GPRs.
+//    For example, in  int y = x << 8  the representation is  x = [r25; r24]
+//    and  RHS = [r24; 00].
+struct absint_val_t;
+class absint_byte_t;
+struct absint_t;
+
+// A ply_t is a potential step towards an optimal sequence to load a constant
+// value into a multi-byte register.  A ply_t loosely relates to one AVR
+// instruction, but it may also represent a sequence of instructions.
+// For example, loading a constant into a lower register when no scratch reg
+// is available may take up to 4 instructions.  There is no 1:1 correspondence
+// to insns, either.
+//    try_split_ldi determines the best sequence of ply_t's by means of a
+// brute-force search with tree pruning:  It's much too complicated to
+// construct a good sequence directly, but there are many conditions that
+// a good sequence will satisfy, implemented in bbinfo_t::find_plies.
+struct ply_t;
+struct plies_t;
+
+// The maximal number of ply_t's in any conceivable optimal solution
+// that is better than what a vanilla mov<mode> generates.
+// This is 6 for modes <= 4 and 8 for modes == 8.
+static constexpr int N_BEST_PLYS = 8;
+
+// The maximal mode size in bytes that is handled.  This is also the
+// number of absint_byte_t's in an absint_t (absint_t::eq_size).
+#define FUSE_MOVE_MAX_MODESIZE 8
+
+#include "avr-passes-fuse-move.h"
+
+// Definitions of the static data members of memento_t, ply_t, plies_t
+// and bbinfo_t.  There are no initializers, hence they start out
+// zero-initialized like all objects of static storage duration.
+
+gprmask_t memento_t::fixed_regs_mask;
+
+// Statistics.
+int ply_t::n_ply_ts;
+int ply_t::max_n_ply_ts;
+int plies_t::max_n_plies;
+
+bbinfo_t *bbinfo_t::bb_info;
+int bbinfo_t::tick;
+bbinfo_t::find_plies_data_t *bbinfo_t::fpd;
+
+// Which optimizations should be performed.
+bool bbinfo_t::try_fuse_p;
+bool bbinfo_t::try_bin_arg1_p;
+bool bbinfo_t::try_split_ldi_p;
+bool bbinfo_t::try_split_any_p;
+bool bbinfo_t::try_simplify_p;
+bool bbinfo_t::use_arith_p;
+bool bbinfo_t::use_set_some_p;
+
+
+// Abstract Interpretation of expressions.
+// A bunch of absint_byte_t's.
+
+struct absint_t
+{
+  static constexpr int eq_size = FUSE_MOVE_MAX_MODESIZE;
+  std::array<absint_byte_t, eq_size> eq;
+
+  rtx xexp = NULL_RTX;
+  rtx xexp_new = NULL_RTX;
+
+  // Access to byte number I, where I is asserted to be in
+  // 0 ... eq_size - 1.
+  absint_byte_t &operator[] (int i)
+  {
+    gcc_assert (i >= 0 && i < absint_t::eq_size);
+    return eq[i];
+  }
+
+  // Same, but for read-only access.
+  const absint_byte_t &operator[] (int i) const
+  {
+    gcc_assert (i >= 0 && i < absint_t::eq_size);
+    return eq[i];
+  }
+
+  // Nothing is known, and there is no associated expression.
+  absint_t () {}
+
+  // Nothing is known yet about the bytes of expression XOLD.
+  absint_t (rtx xold)
+    : xexp (xold)
+  {}
+
+  // Expression XOLD is known to be equivalent to XNEW.  When XNEW is
+  // not null, then the low N_BYTES bytes are set to the respective
+  // bytes of XNEW as of avr_uint8.
+  absint_t (rtx xold, rtx xnew, int n_bytes)
+    : xexp (xold), xexp_new (xnew)
+  {
+    gcc_assert (n_bytes <= eq_size);
+
+    if (! xnew)
+      return;
+
+    for (int pos = 0; pos < n_bytes; ++pos)
+      eq[pos].learn_val8 (avr_uint8 (xnew, pos));
+  }
+
+  // Return the maximal index of a byte that matches, or -1 if there
+  // is no such byte.  A byte matches when
+  // CODE != UNKNOWN: The byte can (CODE).
+  // CODE == UNKNOWN: The code of the byte is known, i.e. not UNKNOWN.
+  int max_knows (rtx_code code = UNKNOWN) const
+  {
+    for (int i = eq_size - 1; i >= 0; --i)
+      {
+        const bool match_p = code == UNKNOWN
+          ? ! eq[i].can (UNKNOWN)
+          : eq[i].can (code);
+
+        if (match_p)
+          return i;
+      }
+
+    return -1;
+  }
+
+  // Return the maximal i such that all bytes < i match, which is the
+  // index of the first byte that does not match, or eq_size when all
+  // bytes match.  A byte matches when
+  // CODE != UNKNOWN: The byte can (CODE).
+  // CODE == UNKNOWN: The code of the byte is known, i.e. not UNKNOWN.
+  int end_knows (rtx_code code = UNKNOWN) const
+  {
+    for (int i = 0; i < eq_size; ++i)
+      {
+        const bool match_p = code == UNKNOWN
+          ? ! eq[i].can (UNKNOWN)
+          : eq[i].can (code);
+
+        if (! match_p)
+          return i;
+      }
+
+    return eq_size;
+  }
+
+  // Number of bytes for which there is usable information, i.e. the
+  // number of bytes with a code other than UNKNOWN.
+  int popcount () const
+  {
+    int n_known = 0;
+
+    for (const absint_byte_t &b : eq)
+      if (! b.can (UNKNOWN))
+        ++n_known;
+
+    return n_known;
+  }
+
+  // Get the value under the assumption that all eq[].val8 are known.
+  uint64_t get_value (int n_bytes, bool strict = true) const
+  {
+    gcc_assert (IN_RANGE (n_bytes, 1, eq_size));
+    gcc_assert (! strict || end_knows (CONST_INT) >= n_bytes);
+
+    uint64_t val = 0;
+    for (int i = n_bytes - 1; i >= 0; --i)
+      val = 256 * val + eq[i].val8 (strict);
+    return val;
+  }
+
+  // Get the value of the low N_BYTES bytes as a const_int in the scalar
+  // int mode of that size.  Return NULL_RTX when some of these bytes is
+  // not a known constant.
+  rtx get_value_as_const_int (int n_bytes) const
+  {
+    gcc_checking_assert (gpr_regno_p (REG_24, n_bytes));
+
+    const bool all_known_p = end_knows (CONST_INT) >= n_bytes;
+
+    if (! all_known_p)
+      return NULL_RTX;
+
+    const uint64_t val = get_value (n_bytes);
+    const machine_mode mode = size_to_mode (n_bytes);
+
+    return gen_int_mode (val, mode);
+  }
+
+  // Find a 16-bit register that contains the same value like held
+  // in positions I1 and I2 (if any).  Return 0 when nothing appropriate
+  // for a MOVW is found.
+  int reg16_with_value (int i1, int i2, const memento_t &memo) const
+  {
+    if (i1 == (i2 ^ 1))
+      {
+       const int lo8 = eq[i1 & ~1].val8 (false);
+       const int hi8 = eq[i1 | 1].val8 (false);
+       if (lo8 >= 0 && hi8 >= 0)
+         return memo.reg16_with_value (lo8, hi8, 0);
+      }
+    return 0;
+  }
+
+  // When X is a REG rtx with a known content as of MEMO, then return
+  // the respective value as a constant for mode MODE.
+  // If X is NULL_RTX, or not a REG, or not known, then return NULL_RTX.
+  static rtx maybe_fold (rtx x, const memento_t &memo)
+  {
+    int n_bytes;
+
+    if (x != NULL_RTX
+       && REG_P (x)
+       && (n_bytes = GET_MODE_SIZE (GET_MODE (x))) <= FUSE_MOVE_MAX_MODESIZE
+       && gpr_regno_p (REGNO (x), n_bytes))
+      {
+       rtx xval = memo.get_value_as_const_int (REGNO (x), n_bytes);
+       if (xval)
+         return avr_chunk (GET_MODE (x), xval, 0);
+      }
+
+    return NULL_RTX;
+  }
+
+  // Try to conclude about the bytes that comprise X.  DEST_MODE is the
+  // context mode that is used when X is CONST_INT and has VOIDmode.
+  // Returns an absint_t for X; bytes about which nothing can be
+  // concluded are left as they are default-constructed.
+  static absint_t explore (rtx x, const memento_t &memo,
+                          machine_mode dest_mode = VOIDmode)
+  {
+    const rtx_code code = GET_CODE (x);
+    bool worth_dumping = dump_file && (dump_flags & TDF_FOLDING);
+
+    const machine_mode mode = GET_MODE (x) == VOIDmode
+      ? dest_mode
+      : GET_MODE (x);
+
+    // A CONST_INT without any context mode is treated as being
+    // eq_size bytes wide.
+    const int n_bytes = mode == VOIDmode && CONST_INT_P (x)
+      ? absint_t::eq_size
+      : GET_MODE_SIZE (mode);
+
+    if (! IN_RANGE (n_bytes, 1, absint_t::eq_size))
+      return absint_t (x);
+
+    // Eat our own dog food as produced by try_split_ldi.
+
+    rtx xop0 = BINARY_P (x) || UNARY_P (x) ? XEXP (x, 0) : NULL_RTX;
+    rtx xval0 = xop0 && CONST_INT_OR_FIXED_P (xop0)
+      ? xop0
+      : absint_t::maybe_fold (xop0, memo);
+
+    // Unary operation on a register with known content:  Try to fold.
+    if (UNARY_P (x)
+       && REG_P (xop0)
+       && GET_MODE (xop0) == mode
+       && xval0)
+      {
+       rtx y = simplify_unary_operation (code, mode, xval0, mode);
+       if (y && CONST_INT_OR_FIXED_P (y))
+         return absint_t (x, y, n_bytes);
+      }
+
+    rtx xop1 = BINARY_P (x) ? XEXP (x, 1) : NULL_RTX;
+    rtx xval1 = xop1 && CONST_INT_OR_FIXED_P (xop1)
+      ? xop1
+      : absint_t::maybe_fold (xop1, memo);
+
+    // Binary operation where both operands are known:  Try to fold.
+    if (BINARY_P (x)
+       && xval0 && xval1)
+      {
+       rtx y = simplify_binary_operation (code, mode, xval0, xval1);
+       if (y && CONST_INT_OR_FIXED_P (y))
+         return absint_t (x, y, n_bytes);
+      }
+
+    // No fold to a constant value was found:
+    // Look at the individual bytes more closely.
+
+    absint_t ai (x);
+
+    switch (code)
+      {
+      default:
+       worth_dumping = false;
+       break;
+
+      case REG:
+       // Each byte equals the respective GPR, and additionally its
+       // value provided MEMO knows it.
+       if (END_REGNO (x) <= REG_32
+           && ! (regmask (x) & memento_t::fixed_regs_mask))
+         for (unsigned r = REGNO (x); r < END_REGNO (x); ++r)
+           {
+             ai[r - REGNO (x)].learn_regno (r);
+             if (memo.knows (r))
+               ai[r - REGNO (x)].learn_val8 (memo.value (r));
+           }
+       break;
+
+      CASE_CONST_UNIQUE:
+       ai = absint_t (x, x, n_bytes);
+       break;
+
+      case ASHIFT:
+      case ASHIFTRT:
+      case LSHIFTRT:
+      case ROTATE:
+      case ROTATERT:
+       if ((CONST_INT_P (xop1) && INTVAL (xop1) >= 8)
+           // DImode shift offsets for transparent calls are shipped in R16.
+           || n_bytes == 8)
+         ai = explore_shift (x, memo);
+       break;
+
+      case AND:
+      case IOR:
+      case XOR:
+       {
+         // These operate bit-wise and hence also byte-wise.
+         const absint_t ai0 = absint_t::explore (xop0, memo, mode);
+         const absint_t ai1 = absint_t::explore (xop1, memo, mode);
+         for (int i = 0; i < n_bytes; ++i)
+           ai[i] = absint_byte_t (code, ai0[i], ai1[i]);
+       }
+       break;
+
+      case NOT:
+       {
+         const absint_t ai0 = absint_t::explore (xop0, memo);
+         for (int i = 0; i < n_bytes; ++i)
+           ai[i] = absint_byte_t (NOT, ai0[i]);
+       }
+       break;
+
+      case ZERO_EXTEND:
+      case SIGN_EXTEND:
+       {
+         // The low bytes are the bytes of the operand, and the bytes above
+         // are derived from the operand's most significant byte.
+         const absint_t ai0 = absint_t::explore (xop0, memo);
+         const int ai0_size = GET_MODE_SIZE (GET_MODE (xop0));
+         const absint_byte_t b_signs = ai0[ai0_size - 1].get_signs (code);
+         for (int i = 0; i < n_bytes; ++i)
+           ai[i] = i < ai0_size ? ai0[i] : b_signs;
+       }
+       break;
+
+      case PLUS:
+      case MINUS:
+       if (SCALAR_INT_MODE_P (mode)
+           || ALL_SCALAR_FIXED_POINT_MODE_P (mode))
+         {
+           const absint_t ai0 = absint_t::explore (xop0, memo, mode);
+           const absint_t ai1 = absint_t::explore (xop1, memo, mode);
+           // As long as the low bytes of the subtrahend are known to be
+           // zero, there is no borrow and the bytes of xop0 pass through.
+           if (code == MINUS)
+             for (int i = 0; i < n_bytes && ai1[i].val8 (false) == 0; ++i)
+               ai[i] = ai0[i];
+
+           // As long as one of the summand bytes is known to be zero,
+           // there is no carry and the other byte passes through.  The
+           // first byte that really adds is recorded as a PLUS; bytes
+           // above it may be subject to a carry, hence stop there.
+           if (code == PLUS)
+             for (int i = 0; i < n_bytes; ++i)
+               {
+                 if (ai0[i].val8 (false) == 0)
+                   ai[i] = ai1[i];
+                 else if (ai1[i].val8 (false) == 0)
+                   ai[i] = ai0[i];
+                 else
+                   {
+                     ai[i] = absint_byte_t (code, ai0[i], ai1[i]);
+                     break;
+                   }
+               }
+
+           // zero_extend + constant where the part of the constant that
+           // overlaps the extended operand is zero:  The bytes above the
+           // extended operand are the bytes of the constant.
+           if (code == PLUS
+               && GET_CODE (xop0) == ZERO_EXTEND
+               && CONST_INT_P (xop1))
+             {
+               rtx exop = XEXP (xop0, 0);
+               int exsize = GET_MODE_SIZE (GET_MODE (exop));
+               rtx lo_xop1 = avr_chunk (GET_MODE (exop), xop1, 0);
+               if (lo_xop1 == const0_rtx)
+                 for (int i = exsize; i < n_bytes; ++i)
+                   ai[i] = ai1[i];
+             }
+         }
+       break; // PLUS, MINUS
+
+      case MULT:
+       if (GET_MODE (xop0) == mode
+           && SCALAR_INT_MODE_P (mode))
+         {
+           // The constant may be located in xop0's zero_extend...
+           const absint_t ai0 = absint_t::explore (xop0, memo, mode);
+           const absint_t ai1 = absint_t::explore (xop1, memo, mode);
+           // Use the known low bytes of each factor, or 1 when the
+           // lowest byte is not a known constant.
+           const int end0 = ai0.end_knows (CONST_INT);
+           const int end1 = ai1.end_knows (CONST_INT);
+           const uint64_t mul0 = end0 > 0 ? ai0.get_value (end0) : 1;
+           const uint64_t mul1 = end1 > 0 ? ai1.get_value (end1) : 1;
+           // Shifting in off/8 zero bytes from the right.
+           const int off = mul0 * mul1 != 0 ? ctz_hwi (mul0 * mul1) : 0;
+           for (int i = 0; i < off / 8; ++i)
+             ai[i].learn_val8 (0);
+         }
+       break; // MULT
+
+      case BSWAP:
+       if (GET_MODE (xop0) == mode)
+         {
+           // Reverse the order of the bytes.
+           const absint_t ai0 = absint_t::explore (xop0, memo);
+           for (int i = 0; i < n_bytes; ++i)
+             ai[i] = ai0[n_bytes - 1 - i];
+         }
+       break;
+      } // switch code
+
+    if (worth_dumping)
+      {
+       avr_dump (";; AI.explore %C:%m ", code, mode);
+       ai.dump ();
+      }
+
+    for (int i = 0; i < n_bytes; ++i)
+      gcc_assert (ai[i].check ());
+
+    return ai;
+  }
+
+  // Helper for the method above.  Handles shifts and rotates of X by a known offset in [8, 8 * n_bytes - 1].
+  static absint_t explore_shift (rtx x, const memento_t &memo)
+  {
+    absint_t ai (x);
+
+    const rtx_code code = GET_CODE (x);
+    const machine_mode mode = GET_MODE (x);
+    const int n_bytes = GET_MODE_SIZE (mode);
+
+    if (! BINARY_P (x))
+      return ai;
+
+    rtx xop0 = XEXP (x, 0);
+    rtx xop1 = XEXP (x, 1);
+
+    // Look at shift offsets of DImode more closely;
+    // they are in R16 for __lshrdi3 etc.  Patch xop1 on success.
+    if (n_bytes == 8
+       && ! CONST_INT_P (xop1)
+       && GET_MODE (xop0) == mode)
+      {
+       const int n_off = GET_MODE_SIZE (GET_MODE (xop1));
+       const absint_t aoff = absint_t::explore (xop1, memo);
+       xop1 = aoff.get_value_as_const_int (n_off);
+      }
+
+    if (! xop1
+       || GET_MODE (xop0) != mode
+       || ! IN_RANGE (n_bytes, 1, FUSE_MOVE_MAX_MODESIZE)
+       || ! CONST_INT_P (xop1)
+       || ! IN_RANGE (INTVAL (xop1), 8, 8 * n_bytes - 1))
+      return ai;
+
+    const int off = INTVAL (xop1);
+    const absint_t ai0 = absint_t::explore (xop0, memo);
+
+    switch (GET_CODE (x))
+      {
+      default:
+       break;
+
+      case ASHIFT:
+       // Shifting in 0x00's from the right.
+       for (int i = 0; i < off / 8; ++i)
+         ai[i].learn_val8 (0);
+       break;
+
+      case LSHIFTRT:
+      case ASHIFTRT:
+       {
+         // Shifting in 0x00's or signs from the left.
+         absint_byte_t b_signs = ai0[n_bytes - 1].get_signs (GET_CODE (x));
+         for (int i = n_bytes - off / 8; i < n_bytes; ++i)
+           ai[i] = b_signs;
+         if (off == 8 * n_bytes - 1)
+           if (code == ASHIFTRT)
+             ai[0] = b_signs; // Every bit of the result is a copy of the sign.
+       }
+       break;
+      }
+
+    if (off % 8 != 0
+       || ai0.popcount () == 0)
+      return ai;
+
+    // For shift offsets that are a multiple of 8, record the
+    // action on the constituent bytes.
+
+    // Bytes are moving left by this offset (or zero for "none").
+    const int boffL = select<int>()
+      : code == ROTATE || code == ASHIFT ? off / 8
+      : code == ROTATERT ? n_bytes - off / 8
+      : 0;
+
+    // Bytes are moving right by this offset (or zero for "none").
+    const int boffR = select<int>()
+      : code == ROTATERT || code == ASHIFTRT || code == LSHIFTRT ? off / 8
+      : code == ROTATE ? n_bytes - off / 8
+      : 0;
+
+    if (dump_flags & TDF_FOLDING)
+      {
+       avr_dump (";; AI.explore_shift %C:%m ", code, mode);
+       if (boffL)
+         avr_dump ("<< %d%s", 8 * boffL, boffL && boffR ? ", " : "");
+       if (boffR)
+         avr_dump (">> %d", 8 * boffR);
+       avr_dump ("\n");
+      }
+
+    if (boffL)
+      for (int i = 0; i < n_bytes - boffL; ++i)
+       ai[i + boffL] = ai0[i]; // Result byte i + boffL equals source byte i.
+
+    if (boffR)
+      for (int i = boffR; i < n_bytes; ++i)
+       ai[i - boffR] = ai0[i]; // Result byte i - boffR equals source byte i.
+
+    return ai;
+  }
+
+  void dump (const char *msg = nullptr, FILE *f = dump_file) const
+  {
+    // No destination register:  Forward to the overload that takes one.
+    if (f != nullptr)
+      this->dump (NULL_RTX, msg, f);
+  }
+
+  void dump (rtx dest, const char *msg = nullptr, FILE *f = dump_file) const // Print to F, if any.
+  {
+    if (f)
+      {
+       int regno = dest && REG_P (dest) ? REGNO (dest) : 0;
+
+       msg = msg && msg[0] ? msg : "AI=[%s]\n"; // Default format.
+       const char *const xs = strstr (msg, "%s");
+       gcc_assert (xs); // MSG must contain a %s placeholder.
+
+       fprintf (f, "%.*s", (int) (xs - msg), msg); // The part of MSG before %s.
+       for (int i = max_knows (); i >= 0; --i) // Highest index first.
+         {
+           const int sub_regno = eq[i].regno (false /*nonstrict*/);
+           const bool nop = regno &&  sub_regno == regno + i;
+           eq[i].dump (nop ? "%s=nop" : "%s", f);
+           fprintf (f, "%s", i ? "; " : xs + strlen ("%s")); // Separator, or the rest of MSG.
+         }
+      }
+  }
+}; // absint_t
+
+
+// Information for a REG = non-memory single_set.
+
+struct insninfo_t
+{
+  // This is an insn that sets the m_size bytes of m_regno to either
+  // - A compile time constant m_isrc (m_code = CONST_INT), or
+  // - The contents of register number m_rsrc (m_code = REG).
+  int m_size;
+  int m_regno;
+  int m_rsrc;
+  rtx_code m_code;
+  uint64_t m_isrc;
+  rtx_insn *m_insn;
+  rtx m_set = NULL_RTX;
+  rtx m_src = NULL_RTX;
+  int m_scratch = 0; // 0 or the register number of a QImode scratch.
+  rtx_code m_old_code = UNKNOWN;
+
+  // Knowledge about the bytes of the SET_SRC:  A byte may have a known
+  // value, may be known to equal some register (e.g. with BSWAP),
+  // or both, or may be unknown.
+  absint_t m_ai;
+
+  // May be set for binary operations.
+  absint_byte_t m_new_src;
+
+  bool init1 (insn_optimize_data_t &, int max_size, const char *purpose);
+
+  // Upper bound for the cost (in words) of a move<mode> insn that
+  // performs a REG = CONST_XXX = .m_isrc move of modesize .m_size.
+  int cost () const;
+  bool combine (const insninfo_t &prev, const insninfo_t &curr);
+  int emit_insn () const;
+
+  bool needs_scratch () const // True when some byte of .m_isrc needs a scratch per ldi_needs_scratch.
+  {
+    gcc_assert (m_code == CONST_INT);
+
+    for (int i = 0; i < m_size; ++i)
+      if (AVRasm::ldi_needs_scratch (m_regno, m_isrc >> (8 * i))) // NOTE(review): uses m_regno, not m_regno + i as hamming() does -- confirm.
+       return true;
+
+    return false;
+  }
+
+  int hamming (const memento_t &memo) const // Number of target bytes not known by MEMO to hold their byte of .m_isrc.
+  {
+    gcc_assert (m_code == CONST_INT);
+
+    int h = 0;
+    for (int i = 0; i < m_size; ++i)
+      h += ! memo.have_value (m_regno + i, 1, 0xff & (m_isrc >> (8 * i)));
+
+    return h;
+  }
+
+  // Upper bound for the number of ply_t's of a solution, given Hamming
+  // distance of HAMM (-1 for unknown).
+  int n_best_plys (int hamm = -1) const
+  {
+    gcc_assert (m_code == CONST_INT);
+
+    if (m_size == 8)
+      return (hamm >= 0 ? hamm : m_size);
+    else if (hamm <= 4)
+      return (hamm >= 0 ? hamm : m_size)
+       // The following terms are the max number of MOVWs with a
+       // Hamming difference of less than 2.
+       + (AVR_HAVE_MOVW && m_regno < REG_14) * m_size / 2
+       + (AVR_HAVE_MOVW && m_regno == REG_14) * std::max (0, m_size - 2)
+       - (AVR_HAVE_MOVW && hamm == 4 && (uint32_t) m_isrc % 0x10001 == 0);
+    else
+      gcc_unreachable ();
+  }
+}; // insninfo_t
+
+
+struct insn_optimize_data_t
+{
+  // Known values held in GPRs prior to the action of .insn / .ii.
+  memento_t &regs;
+  rtx_insn *insn = nullptr; // Null until an insn has been assigned; insninfo_t::init1 tests for that.
+  insninfo_t ii;            // Information about .insn.
+  bool unused = false;      // Reset to false by insninfo_t::init1.
+
+  insn_optimize_data_t () = delete;
+
+  // The constructor only binds the GPR state; the default member
+  // initializers make sure that no scalar member starts out indeterminate.
+  insn_optimize_data_t (memento_t &memo)
+    : regs(memo)
+  {}
+}; // insn_optimize_data_t
+
+struct optimize_data_t // State for optimizing the current insn together with its predecessor.
+{
+  insn_optimize_data_t prev;
+  insn_optimize_data_t curr;
+
+  // Number >= 0 of new insns that replace the curr insn and maybe also the
+  // prev insn.  -1 when no replacement has been found.
+  int n_new_insns = -1;
+
+  // .prev will be removed provided we have (potentially zero) new insns.
+  bool delete_prev_p = false;
+
+  // Ignore these GPRs when comparing the simulation results of
+  // old and new insn sequences.  Usually some scratch reg(s).
+  gprmask_t ignore_mask = 0;
+
+  optimize_data_t () = delete;
+
+  optimize_data_t (memento_t &prev_regs, memento_t &curr_regs)
+    : prev(prev_regs), curr(curr_regs)
+  {}
+
+  bool try_fuse (bbinfo_t *);
+  bool try_bin_arg1 (bbinfo_t *);
+  bool try_simplify (bbinfo_t *);
+  bool try_split_ldi (bbinfo_t *);
+  bool try_split_any (bbinfo_t *);
+  bool fail (const char *reason);
+  bool emit_signs (int r_sign, gprmask_t);
+  void emit_move_mask (int dest, int src, int n_bytes, gprmask_t &);
+  rtx_insn *emit_sequence (basic_block, rtx_insn *);
+  bool get_2ary_operands (rtx_code &, const absint_byte_t &,
+                         insn_optimize_data_t &, int r_dest,
+                         absint_val_t &, absint_val_t &, int &ex_cost);
+  rtx_insn *emit_and_apply_move (memento_t &, rtx dest, rtx src);
+
+  // Apply INSNS to M2, the GPR state as the sequence starts.  On return, M1 is the state before the last insn.
+  static void apply_sequence (const std::vector<rtx_insn *> &insns,
+                             memento_t &m1, memento_t &m2)
+  {
+    gcc_assert (insns.size () >= 1);
+
+    for (auto &i : insns)
+      {
+       m1 = m2;
+       m2.apply_insn (i, false);
+      }
+  }
+}; // optimize_data_t
+
+
+// Emit INSNS before .curr.insn, replacing .curr.insn and also .prev.insn when
+// .delete_prev_p is on.  Adjusts .curr.regs and .prev.regs accordingly.  Returns the last new insn, or an insn before the deleted one(s) when INSNS is empty.
+rtx_insn *
+optimize_data_t::emit_sequence (basic_block bb, rtx_insn *insns)
+{
+  gcc_assert (n_new_insns >= 0);
+
+  // The old insns will be replaced by and simulated...
+  const std::vector<rtx_insn *> old_insns = delete_prev_p
+    ? std::vector<rtx_insn *> { prev.insn, curr.insn }
+    : std::vector<rtx_insn *> { curr.insn };
+
+  // ...against the new insns.
+  std::vector<rtx_insn *> new_insns;
+  for (rtx_insn *i = insns; i; i = NEXT_INSN (i))
+    new_insns.push_back (i);
+
+  rtx_insn *new_curr_insn;
+
+  memento_t &m1 = prev.regs;
+  memento_t &m2 = curr.regs;
+
+  if (new_insns.empty ())
+    {
+      if (delete_prev_p)
+       {
+         m2 = m1; // Both old insns vanish, so the state before .prev is the new current state.
+         m1.known = 0;
+         new_curr_insn = prev_nondebug_insn_bb (bb, prev.insn);
+       }
+      else
+       new_curr_insn = prev.insn; // Only .curr.insn is dropped without replacement.
+    }
+  else
+    {
+      // We are going to emit at least one new insn.  Simulate the effect of
+      // the new sequence and compare it against the effect of the old one.
+      // Both effects must be the same (modulo scratch regs).
+
+      memento_t n1 = m1;
+      memento_t n2 = m2;
+
+      if (delete_prev_p)
+       {
+         m2 = m1, m1.known = 0; // Both simulations start from the state before .prev.insn.
+         n2 = n1, n1.known = 0;
+       }
+
+      avr_dump (";; Applying new route...\n");
+      optimize_data_t::apply_sequence (new_insns, n1, n2);
+
+      avr_dump (";; Applying old route...\n");
+      optimize_data_t::apply_sequence (old_insns, m1, m2);
+      avr_dump ("\n");
+
+      if (! m2.equals (n2, ignore_mask))
+       {
+         // When we come here, then
+         // - We have a genuine bug, and/or
+         // - We did produce insns that are opaque to absint_t's explore().
+         avr_dump ("INCOMPLETE APPLICATION:\n");
+         m2.dump ("regs old route=%s\n\n");
+         n2.dump ("regs new route=%s\n\n");
+         avr_dump ("The new insns are:\n%L", insns);
+
+         fatal_insn ("incomplete application of insn", insns);
+       }
+
+      // Use N1 and N2 as the new GPR states.  Even though they are equal
+      // modulo ignore_mask, N2 may know more about GPRs when it doesn't
+      // clobber the scratch reg.
+      m1 = n1;
+      m2 = n2;
+
+      emit_insn_before (insns, curr.insn);
+
+      new_curr_insn = new_insns.back ();
+    }
+
+  if (delete_prev_p)
+    SET_INSN_DELETED (prev.insn);
+
+  SET_INSN_DELETED (curr.insn);
+
+  return new_curr_insn;
+}
+
+
+const pass_data avr_pass_data_fuse_move =
+{
+  RTL_PASS,     // type
+  "",           // name (will be patched by the avr_pass_fuse_move constructor)
+  OPTGROUP_NONE, // optinfo_flags
+  TV_MACH_DEP,  // tv_id
+  0,            // properties_required
+  0,            // properties_provided
+  0,            // properties_destroyed
+  0,            // todo_flags_start
+  TODO_df_finish | TODO_df_verify // todo_flags_finish
+};
+
+
+class avr_pass_fuse_move : public rtl_opt_pass
+{
+public:
+  avr_pass_fuse_move (gcc::context *ctxt, const char *name)
+    : rtl_opt_pass (avr_pass_data_fuse_move, ctxt)
+  {
+    this->name = name;
+  }
+
+  unsigned int execute (function *func) final override
+  {
+    if (optimize > 0 && avr_fuse_move > 0)
+      {
+       df_note_add_problem ();
+       df_analyze ();
+
+       bbinfo_t::optimize_one_function (func);
+      }
+
+    return 0;
+  }
+}; // avr_pass_fuse_move
+
+
+// Append PLY to .plies[].  A SET or BLD ply may start a new sequence of
+// SETs or BLDs and gets assigned the overhead of the sequence like for an
+// initial SET or CLT instruction.  A SET ply may be added in two flavours:
+// One that starts a sequence of single_sets, and one that represents the
+// payload of a set_some insn.  MEMO is the GPR state prior to PLY.
+void
+plies_t::add (ply_t ply, const ply_t *prev, const memento_t &memo,
+             bool maybe_set_some)
+{
+  if (ply.code == SET)
+    {
+      if (prev && prev->code == SET)
+       {
+         // Proceed with the SET sequence flavour.
+         ply.in_set_some = prev->in_set_some;
+
+         if (ply.in_set_some)
+           ply.scratch = 0;
+         else if (! ply.scratch && ply.needs_scratch ())
+           ply.cost += 2;
+       }
+      else
+       {
+         // The 1st SET in a sequence.  May use set_some to set
+         // all bytes in one insn, or a bunch of single_sets.
+
+         // Route1: Bunch of single_sets.
+         const int ply_cost = ply.cost;
+         if (! ply.scratch && ply.needs_scratch ())
+           ply.cost += 2;
+         ply.in_set_some = false;
+
+         add (ply); // NOTE(review): the same ply is added again below when MAYBE_SET_SOME is false -- intended?
+
+         if (maybe_set_some)
+           {
+             // Route 2: One set_some: The 1st SET gets all the overhead.
+             ply.scratch = 0;
+             ply.cost = ply_cost + 1 + ! memo.known_dregno ();
+             ply.in_set_some = true;
+           }
+       }
+    } // SET
+  else if (ply.is_bld ())
+    {
+      // The first BLD in a series of BLDs gets the extra costs
+      // for the SET / CLT that precedes the BLDs.
+      ply.cost += ! ply.is_same_bld (prev);
+    }
+
+  add (ply);
+}
+
+
+// Emit insns for .plies[] and return the number of emitted insns.
+// The emitted insns represent the effect of II with MEMO, which
+// is the GPR knowledge before II is executed.
+int
+plies_t::emit_insns (const insninfo_t &ii, const memento_t &memo) const
+{
+  int n_insns = 0;
+
+  for (int i = 0; i < n_plies; ++i)
+    {
+      const ply_t &p = plies[i];
+
+      // SETs and BLDs are dumped by their emit_xxxs().
+      if (p.code != SET && ! p.is_bld ())
+       p.dump ();
+
+      rtx src1 = NULL_RTX; // Operand of a move or of a unary operation.
+      rtx src2 = NULL_RTX; // 2nd operand of a binary operation on DEST.
+      rtx dest = NULL_RTX;
+      rtx xscratch = NULL_RTX;
+      rtx_code code = p.code;
+
+      switch (p.code)
+       {
+       default:
+         avr_dump ("\n\n;; Bad ply_t:\n");
+         p.dump (i + 1);
+         gcc_unreachable ();
+         break;
+
+       case REG: // *movhi = MOVW; movqi_insn = MOV
+         dest = gen_rtx_REG (p.size == 1 ? QImode : HImode, p.regno);
+         src1 = gen_rtx_REG (p.size == 1 ? QImode : HImode, p.arg);
+         break;
+
+       case SET: // movqi_insn = LDI, CLR; set_some = (LDI + MOV) ** size.
+         i += emit_sets (ii, n_insns, memo, i) - 1; // Skip all plies consumed by emit_sets.
+         continue;
+
+       case MOD: // *ior<mode>3, *and<mode>3 = SET + BLD... / CLT + BLD...
+         i += emit_blds (ii, n_insns, i) - 1; // Skip all plies consumed by emit_blds.
+         continue;
+
+       case MINUS: // *subqi3 = SUB
+       case PLUS:  // *addqi3 = ADD
+       case AND: // *andqi3 = AND
+       case IOR: // *iorqi3 = OR
+       case XOR: // *xorqi3 = EOR
+         dest = gen_rtx_REG (QImode, p.regno);
+         src2 = gen_rtx_REG (QImode, p.arg);
+         break;
+
+       case PRE_INC: // *addqi3 = INC
+       case PRE_DEC: // *addqi3 = DEC
+         code = PLUS;
+         dest = gen_rtx_REG (QImode, p.regno);
+         src2 = p.code == PRE_INC ? const1_rtx : constm1_rtx;
+         break;
+
+       case NEG: // *negqi2 = NEG
+       case NOT: // *one_cmplqi2 = COM
+         dest = gen_rtx_REG (QImode, p.regno);
+         src1 = dest;
+         break;
+
+       case ROTATE:   // *rotlqi3 = SWAP
+       case ASHIFT:   // *ashlqi3 = LSL
+       case ASHIFTRT: // *ashrqi3 = ASR
+       case LSHIFTRT: // *lshrqi3 = LSR
+         dest = gen_rtx_REG (QImode, p.regno);
+         src2 = GEN_INT (code == ROTATE ? 4 : 1);
+         break;
+
+       case SS_PLUS: // *addhi3 = ADIW, SBIW
+         code = PLUS;
+         dest = gen_rtx_REG (HImode, p.regno);
+         src2 = gen_int_mode (p.arg, HImode);
+         break;
+       } // switch p.code
+
+      gcc_assert (dest && (! src1) + (! src2) == 1); // Exactly one of SRC1 and SRC2 is set.
+
+      rtx src = code == REG || code == SET
+       ? src1
+       : (src2
+          ? gen_rtx_fmt_ee (code, GET_MODE (dest), dest, src2)
+          : gen_rtx_fmt_e (code, GET_MODE (dest), src1));
+
+      emit_valid_move_clobbercc (dest, src, xscratch);
+      n_insns += 1;
+    }
+
+  return n_insns;
+}
+
+
+// Helper for .emit_insns().  Emit an ior<mode>3 or and<mode>3 insns
+// that's equivalent to a sequence of contiguous BLDs starting at
+// .plies[ISTART].  Updates N_INSNS according to the number of insns emitted
+// and returns the number of consumed plys in .plies[].
+int
+plies_t::emit_blds (const insninfo_t &ii, int &n_insns, int istart) const
+{
+  const ply_t &first = plies[istart];
+
+  gcc_assert (ii.m_size <= 4);
+  gcc_assert (first.is_bld ());
+
+  const rtx_code code = first.is_setbld () ? IOR : AND;
+  const machine_mode mode = size_to_mode (ii.m_size);
+
+  // Determine mask and number of BLDs.
+
+  uint32_t mask = 0;
+  int n_blds = 0;
+
+  for (int i = istart; i < n_plies; ++i, ++n_blds)
+    {
+      const ply_t &p = plies[i];
+      if (! p.is_bld () || ! p.is_same_bld (& first))
+       break;
+
+      // For AND, work on the 1-complement of the mask,
+      // i.e. 1's specify which bits to clear.
+      uint8_t mask8 = code == IOR ? p.arg : ~p.arg;
+      mask |= mask8 << (8 * (p.regno - ii.m_regno));
+    }
+
+  mask = GET_MODE_MASK (mode) & (code == IOR ? mask : ~mask);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, ";; emit_blds[%d...%d] R%d[%d]%s=%0*x\n",
+              istart, istart + n_blds - 1, ii.m_regno, ii.m_size,
+              code == IOR ? "|" : "&", 2 * ii.m_size, (int) mask);
+    }
+
+  for (int i = 0; i < n_blds; ++i)
+    plies[i + istart].dump ();
+
+  rtx dest = gen_rtx_REG (mode, ii.m_regno);
+  rtx src = gen_rtx_fmt_ee (code, mode, dest, gen_int_mode (mask, mode));
+  rtx xscratch = mode == QImode ? NULL_RTX : gen_rtx_SCRATCH (QImode);
+
+  emit_valid_move_clobbercc (dest, src, xscratch);
+  n_insns += 1;
+
+  return n_blds;
+}
+
+
+// Emit insns for a contiguous sequence of SET ply_t's starting at
+// .plies[ISTART].  Advances N_INSNS by the number of emitted insns.
+// MEMO ist the state of the GPRs before II es executed, where II
+// represents the insn under optimization.
+// The emitted insns are "movqi_insn" or "*reload_inqi"
+// when .plies[ISTART].in_set_some is not set, and one "set_some" insn
+// when .plies[ISTART].in_set_some is set.
+int
+plies_t::emit_sets (const insninfo_t &ii, int &n_insns, const memento_t &memo,
+                   int istart) const
+{
+  gcc_assert (plies[istart].code == SET);
+
+  const bool in_set_some = plies[istart].in_set_some;
+
+  // Some d-regno that holds a compile-time constant, or 0.
+  const int known_dregno = memo.known_dregno ();
+
+  // Determine number of contiguous SETs,
+  // and sort them in ps[] such that smaller regnos come first.
+
+  const ply_t *ps[FUSE_MOVE_MAX_MODESIZE];
+  int n_sets = 0;
+
+  for (int i = istart; i < n_plies && plies[i].code == SET; ++i)
+    ps[n_sets++] = & plies[i];
+
+  if (dump_file)
+    {
+      fprintf (dump_file, ";; emit_sets[%d...%d] R%d[%d]=%0*" PRIx64,
+              istart, istart + n_sets - 1, ii.m_regno, ii.m_size,
+              2 * ii.m_size, ii.m_isrc);
+      fprintf (dump_file, ", scratch=%s%d", "R" + ! ii.m_scratch, ii.m_scratch);
+      fprintf (dump_file, ", known_dreg=%s%d, set_some=%d\n",
+              "R" + ! known_dregno, known_dregno, in_set_some);
+    }
+
+  for (int i = 0; i < n_sets; ++i)
+    ps[i]->dump ();
+
+  // Sort.  This is most useful on regs like (reg:SI REG_14).
+  for (int i = 0; i < n_sets - 1; ++i)
+    for (int j = i + 1; j < n_sets; ++j)
+      if (ps[i]->regno > ps[j]->regno)
+       std::swap (ps[i], ps[j]);
+
+  // Prepare operands.
+  rtx dst[FUSE_MOVE_MAX_MODESIZE];
+  rtx src[FUSE_MOVE_MAX_MODESIZE];
+  for (int i = 0; i < n_sets; ++i)
+    {
+      dst[i] = gen_rtx_REG (QImode, ps[i]->regno);
+      src[i] = gen_int_mode (ps[i]->arg, QImode);
+    }
+
+  if (in_set_some)
+    {
+      // Emit a "set_some" insn that sets all of the collected 8-bit SETs.
+      // This is a parallel with n_sets QImode SETs as payload.
+
+      gcc_assert (! known_dregno || memo.knows (known_dregno));
+
+      // A scratch reg...
+      rtx op1 = known_dregno
+       ? gen_rtx_REG (QImode, known_dregno)
+       : const0_rtx;
+      // ...with a known content, so it can be restored without saving.
+      rtx op2 = known_dregno
+       ? gen_int_mode (memo.values[known_dregno], QImode)
+       : const0_rtx;
+      // Target register envelope.
+      rtx op3 = GEN_INT (ii.m_regno);
+      rtx op4 = GEN_INT (ii.m_size);
+
+      // Payload.
+      for (int i = 0; i < n_sets; ++i)
+       dst[i] = gen_rtx_SET (dst[i], src[i]);
+
+      rtvec vec = gen_rtvec (5 + n_sets,
+                            gen_rtx_USE (VOIDmode, op1),
+                            gen_rtx_USE (VOIDmode, op2),
+                            gen_rtx_USE (VOIDmode, op3),
+                            gen_rtx_USE (VOIDmode, op4),
+                            gen_rtx_CLOBBER (VOIDmode, cc_reg_rtx),
+                            dst[0], dst[1], dst[2], dst[3]);
+      rtx pattern = gen_rtx_PARALLEL (VOIDmode, vec);
+
+      emit_valid_insn (pattern);
+      n_insns += 1;
+    }
+  else
+    {
+      // Emit a bunch of movqi_insn / *reload_inqi insns.
+
+      for (int i = 0; i < n_sets; ++i)
+       if (ii.m_scratch
+           && AVRasm::constant_cost (SET, ps[i]->regno, ps[i]->arg) > 1)
+         {
+           rtx scratch = gen_rtx_REG (QImode, ii.m_scratch);
+           bool use_reload_inqi = true;
+           if (use_reload_inqi)
+             {
+               emit_valid_move_clobbercc (dst[i], src[i], scratch);
+               n_insns += 1;
+             }
+           else
+             {
+               emit_valid_move_clobbercc (scratch, src[i]);
+               emit_valid_move_clobbercc (dst[i], scratch);
+               n_insns += 2;
+             }
+         }
+       else
+         {
+           emit_valid_move_clobbercc (dst[i], src[i]);
+           n_insns += 1;
+         }
+    }
+
+  return n_sets;
+}
+
+
+// Try to find an operation such that  Y = op (X).
+// Shifts and rotates are regarded as unary operaions with
+// an implied 2nd operand.
+static rtx_code
+find_arith (uint8_t y, uint8_t x)
+{
+#define RETIF(ex, code) y == (0xff & (ex)) ? code
+  return select<rtx_code>()
+    : RETIF (x + 1, PRE_INC)
+    : RETIF (x - 1, PRE_DEC)
+    : RETIF ((x << 4) | (x >> 4), ROTATE)
+    : RETIF (-x, NEG)
+    : RETIF (~x, NOT)
+    : RETIF (x >> 1, LSHIFTRT)
+    : RETIF (x << 1, ASHIFT)
+    : RETIF ((x >> 1) | (x & 0x80), ASHIFTRT)
+    : UNKNOWN;
+#undef RETIF
+}
+
+
+// Try to find an operation such that  Z = X op X.
+static rtx_code
+find_arith2 (uint8_t z, uint8_t x, uint8_t y)
+{
+#define RETIF(ex, code) z == (0xff & (ex)) ? code
+  return select<rtx_code>()
+    : RETIF (x + y, PLUS)
+    : RETIF (x - y, MINUS)
+    : RETIF (x & y, AND)
+    : RETIF (x | y, IOR)
+    : RETIF (x ^ y, XOR)
+    : UNKNOWN;
+#undef RETIF
+}
+
+
+// Add plies to .plies[] that represent a MOVW, but only ones that reduce the
+// Hamming distance from REGNO[SIZE] to VAL by exactly DHAMM.  MEMO is the GPR state before the MOVW.
+void
+plies_t::add_plies_movw (int regno, int size, uint64_t val,
+                        int dhamm, const memento_t &memo)
+{
+  if (! AVR_HAVE_MOVW || size < 2)
+    return;
+
+  for (int i = 0; i < size - 1; i += 2)
+    {
+      // MOVW that sets less than 2 regs to the target value is
+      // not needed for the upper regs.
+      if (dhamm != 2 && regno + i >= REG_16)
+       continue;
+
+      const uint16_t val16 = val >> (8 * i); // The 16-bit target value for regs REGNO + I and REGNO + I + 1.
+      const uint8_t lo8 = val16;
+      const uint8_t hi8 = val16 >> 8;
+
+      // When one of the target bytes is already as expected, then
+      // no MOVW is needed for an optimal sequence.
+      if (memo.have_value (regno + i, 1, lo8)
+         || memo.have_value (regno + i + 1, 1, hi8))
+       continue;
+
+      const int h_old = memo.hamming (regno + i, 2, val16);
+
+      // Record MOVWs that reduce the Hamming distance by DHAMM as requested.
+      for (int j = FIRST_GPR; j < REG_32; j += 2)
+       if (j != regno + i
+           && memo.knows (j, 2))
+         {
+           const int h_new = memo.hamming (j, 2, val16);
+           if (h_new == h_old - dhamm)
+             add (ply_t { regno + i, 2, REG, j, 1, dhamm });
+         }
+    }
+}
+
+
+// Set PS to plys that reduce the Hamming distance from II.m_regno to
+// compile-time constant II.m_isrc by 2, 1 or 0.  PREV is NULL or points
+// to a previous ply_t.  MEMO is the GPR state after PREV and prior to the
+// added plys.
+void
+bbinfo_t::get_plies (plies_t &ps, const insninfo_t &ii, const memento_t &memo,
+                    const ply_t *prev)
+{
+  ps.reset ();
+
+  fpd->n_get_plies += 1;
+
+  const bool maybe_set_some = (bbinfo_t::use_set_some_p && ii.needs_scratch ());
+
+  // Start with cheap plies, then continue to more expensive ones.
+  const int regno = ii.m_regno;
+  const int size = ii.m_size;
+  const uint64_t val = ii.m_isrc;
+
+  // Find MOVW with a Hamming delta of 2.
+  ps.add_plies_movw (regno, size, val, 2, memo);
+
+  // Find ADIW / SBIW:  Known 16-bit values in regs >= REG_24 that differ from the target by -63 ... 63.
+  if (AVR_HAVE_ADIW && size >= 2)
+    for (int i = 0; i < size - 1; i += 2)
+      if (regno + i >= REG_24
+         && memo.knows (regno + i, 2))
+       {
+         const int16_t value16 = memo[regno + i] + 256 * memo[regno + i + 1];
+         const int16_t lo16 = val >> (8 * i);
+         const int16_t delta = lo16 - value16;
+         const uint8_t lo8 = val >> (8 * i);
+         const uint8_t hi8 = val >> (8 * i + 8);
+         if (IN_RANGE (delta, -63, 63)
+             && lo8 != memo[regno + i]
+             && hi8 != memo[regno + i + 1]) // Only when both bytes have to change.
+           {
+             ps.add (ply_t { regno + i, 2, SS_PLUS, delta, 1, 2 });
+           }
+       }
+
+  // Find 1-reg plies.  In an optimal sequence, each 1-reg ply will decrease
+  // the Hamming distance.  Thus we only have to consider plies that set
+  // one of the target bytes to the target value VAL.  Start with the
+  // high registers since that is the canonical order when two plies commute.
+
+  for (int i = size - 1; i >= 0; --i)
+    {
+      const uint8_t val8 = val >> (8 * i);
+
+      // Nothing to do for this byte when its value is already as desired.
+      if (memo.have_value (regno + i, 1, val8))
+       continue;
+
+      // LDI or CLR.
+      if (regno + i >= REG_16 || val8 == 0)
+       ps.add (ply_t { regno + i, 1, SET, val8, 1 }, prev, memo,
+               maybe_set_some);
+
+      // We only may need to MOV non-zero values since there is CLR,
+      // and only when there is no LDI.
+      if (val8 != 0
+         && regno + i < REG_16)
+       {
+         // MOV where the source register is one of the target regs.
+         for (int j = 0; j < size; ++j)
+           if (j != i)
+             if (memo.have_value (regno + j, 1, val8))
+               ps.add (ply_t { regno + i, 1, REG, regno + j, 1 });
+
+         // MOV where the source register is not a target reg.
+         // FIXME: ticks.
+         for (int j = FIRST_GPR; j < REG_32; ++j)
+           if (! IN_RANGE (j, regno, regno + size - 1))
+             if (memo.have_value (j, 1, val8))
+               ps.add (ply_t { regno + i, 1, REG, j, 1 });
+
+         // LDI + MOV.
+         if (regno + i < REG_16 && val8 != 0) // Already implied by the enclosing condition.
+           {
+             ply_t p { regno + i, 1, SET, val8, 2 };
+             p.scratch = ii.m_scratch;
+             ps.add (p, prev, memo, maybe_set_some);
+           }
+       }
+    }
+
+  // Arithmetic like INC, DEC or ASHIFT.
+  for (int i = size - 1; i >= 0; --i)
+    if (bbinfo_t::use_arith_p
+       && regno + i < REG_16
+       && memo.knows (regno + i))
+      {
+       const uint8_t y = val >> (8 * i); // The target value of the byte.
+       const uint8_t x = memo[regno + i]; // The known current value of the byte.
+       rtx_code code;
+
+       if (y == 0 || y == x)
+         continue;
+
+       // INC, DEC, SWAP, LSL, NEG, ...
+       if (UNKNOWN != (code = find_arith (y, x)))
+         {
+           ps.add (ply_t { regno + i, 1, code, x /* dummy */, 1 });
+           continue;
+         }
+
+       // ADD, AND, ...
+       for (int r = FIRST_GPR; r < REG_32; ++r)
+         if (r != regno + i
+             && memo.knows (r)
+             && memo[r] != 0
+             && UNKNOWN != (code = find_arith2 (y, x, memo[r])))
+           {
+             ps.add (ply_t { regno + i, 1, code, r, 1 });
+           }
+
+       if (size < 2 || size > 4)
+         continue;
+
+       // SET + BLD:  Y equals X with exactly one more bit set.
+       if ((x & y) == x && popcount_hwi (x ^ y) == 1)
+         ps.add (ply_t { regno + i, 1, MOD, x ^ y, 1 },
+                 prev, memo, maybe_set_some);
+
+       // CLT + BLD:  Y equals X with exactly one bit cleared.
+       if ((x & y) == y && popcount_hwi (x ^ y) == 1)
+         ps.add (ply_t { regno + i, 1, MOD, x ^ y ^ 0xff, 1 },
+                 prev, memo, maybe_set_some);
+      }
+
+  if (bbinfo_t::use_arith_p
+      // For 8-byte values, don't use ply_t's with only a partial reduction
+      // of the hamming distance.
+      && size <= 4)
+    {
+      // Find MOVW with a Hamming delta of 1, then 0.
+      ps.add_plies_movw (regno, size, val, 1, memo);
+      ps.add_plies_movw (regno, size, val, 0, memo);
+    }
+
+  plies_t::max_n_plies = std::max (plies_t::max_n_plies, ps.n_plies);
+}
+
+
+// Try to combine two 8-bit insns PREV and CURR that (effectively)
+// are REG = CONST_INT to one 16-bit such insn.  Returns true on success.
+bool
+insninfo_t::combine (const insninfo_t &prev, const insninfo_t &curr)
+{
+  if (prev.m_size == 1 && curr.m_size == 1
+      && prev.m_regno == (1 ^ curr.m_regno)
+      && curr.m_code == CONST_INT
+      && prev.m_code == CONST_INT)
+    {
+      m_regno = curr.m_regno & ~1;
+      m_code = CONST_INT;
+      m_size = 2;
+      m_scratch = std::max (curr.m_scratch, prev.m_scratch);
+      m_isrc = m_regno == prev.m_regno
+       ? (uint8_t) prev.m_isrc + 256 * (uint8_t) curr.m_isrc
+       : (uint8_t) curr.m_isrc + 256 * (uint8_t) prev.m_isrc;
+
+      return true;
+    }
+
+  return false;
+}
+
+
+// Return the cost (in terms of words) of the respective mov<mode> insn.
+// This can be used as an upper bound for the ply_t's cost.
+int
+insninfo_t::cost () const
+{
+  if (m_code != CONST_INT)
+    return m_size;
+
+  if (m_regno >= REG_16 || m_isrc == 0) // At most one word per byte.
+    return m_size
+      // MOVW can save one instruction.
+      - (AVR_HAVE_MOVW && m_size == 4 && (uint32_t) m_isrc % 0x10001 == 0);
+
+  // LDI + MOV to a lower reg.
+  if (m_scratch && m_size == 1)
+    return 2;
+
+  if (m_size == 8)
+    {
+      int len = m_size; // One word per byte, plus one more for each non-zero byte in a reg below REG_16.
+      for (int i = 0; i < m_size; ++i)
+       len += m_regno + i < REG_16 && (0xff & (m_isrc >> (8 * i))) != 0;
+      return len;
+    }
+
+  // All other cases are complicated.  Ask the output oracle.
+  const machine_mode mode = size_to_mode (m_size);
+  rtx xscratch = m_scratch ? all_regs_rtx[m_scratch] : NULL_RTX;
+  rtx xop[] = { gen_rtx_REG (mode, m_regno), gen_int_mode (m_isrc, mode) };
+  int len;
+  if (m_size == 4)
+    output_reload_insisf (xop, xscratch, &len);
+  else
+    output_reload_in_const (xop, xscratch, &len, false);
+
+  return len;
+}
+
+// Emit the according REG = REG-or-CONST_INT insn.  Returns 1 or aborts
+// when the insn is not of that form.
+int
+insninfo_t::emit_insn () const
+{
+  int n_insns = 0;
+
+  machine_mode mode = size_to_mode (m_size);
+  rtx xsrc = NULL_RTX;
+  rtx xscratch = NULL_RTX;
+
+  gcc_assert (m_size > 0);
+
+  switch (m_code)
+    {
+    default:
+      gcc_unreachable();
+
+    case CONST_INT:
+      xsrc = gen_int_mode (m_isrc, mode);
+      if (m_scratch && m_regno < REG_16)
+       xscratch = gen_rtx_REG (QImode, m_scratch);
+      break;
+
+    case REG:
+      gcc_assert (gpr_regno_p (m_rsrc, m_size));
+      if (m_regno != m_rsrc)
+       xsrc = gen_rtx_REG (mode, m_rsrc);
+      break;
+    }
+
+  if (xsrc)
+    {
+      rtx dest = gen_rtx_REG (mode, m_regno);
+      emit_valid_move_clobbercc (dest, xsrc, xscratch);
+      n_insns += 1;
+    }
+
+  return n_insns;
+}
+
+
+// Entering a basic block means combining known register values from
+// all incoming BBs.
+void
+bbinfo_t::enter ()
+{
+  avr_dump ("\n;; Entering [bb %d]\n", bb->index);
+
+  gcc_assert (! done);
+
+  edge e;
+  edge_iterator ei;
+  gprmask_t pred_known_mask = ~0u; // Intersection of the known masks of all predecessors.
+  bbinfo_t *bbi = nullptr; // Stays null when there are no incoming edges.
+
+  // A quick iteration over all predecessors / incoming edges to reveal
+  // whether this BB is worth a closer look.
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      basic_block pred = e->src;
+      bbi = & bb_info[pred->index];
+
+      pred_known_mask &= bbi->regs.known;
+
+      if (dump_file)
+       {
+         avr_dump (";; [bb %d] <- [bb %d] ", e->dest->index, e->src->index);
+         if (bbi->done)
+           bbi->regs.dump ();
+         else
+           avr_dump (" (unknown)\n");
+       }
+    }
+
+  // Only if all predecessors have already been handled, we can
+  // have known values as we are entering the current BB.
+  if (pred_known_mask != 0
+      && bbi != nullptr)
+    {
+      // Initialize current BB info from BBI, an arbitrary predecessor.
+
+      regs = bbi->regs;
+
+      // Coalesce the output values from all preceding BBs.  At the
+      // start of the current BB, a value is only known if it is known
+      // in *all* predecessors and *all* these values are the same.
+      FOR_EACH_EDGE (e, ei, bb->preds)
+       {
+         regs.coalesce (bb_info[e->src->index].regs);
+       }
+    }
+
+  if (dump_file)
+    {
+      avr_dump (";; [bb %d] known at start: ", bb->index);
+      if (regs.known)
+       regs.dump ();
+      else
+       avr_dump (" (none)\n");
+      avr_dump ("\n");
+    }
+}
+
+
+// Mark this basic block as done, and log that when dumping is on.
+void
+bbinfo_t::leave ()
+{
+  done = true;
+
+  if (! dump_file)
+    return;
+
+  fprintf (dump_file, ";; Leaving [bb %d]\n\n", bb->index);
+}
+
+
+/* Initialize this object according to IOD.insn, which is supposed to be a
+   single_set (possibly with a scratch) of a GPR that's (effectively) a
+   reg = reg or reg = const move, or some other operation that can be
+   described in terms of the register contents in IOD.regs.  IOD.insn may be
+   the result of the current pass's optimization, e.g. something like INC R2
+   where R2 has a known content.  IOD.regs is presumably the GPR state prior
+   to the insn (TODO confirm against the callers).  Only CONST cases are
+   recorded; plus cases that are non-trivial for example when an XOR decays
+   to a move.
+
+   MAX_SIZE is an upper bound in bytes for the size of the destination; it
+   is capped at FUSE_MOVE_MAX_MODESIZE.  PURPOSE is a prefix for dump output.
+
+   Returns true iff m_size has been set to a non-zero value, i.e. to the
+   size in bytes of the destination.  Also sets IOD.unused according to
+   whether the destination has a REG_UNUSED note.  */
+
+bool
+insninfo_t::init1 (insn_optimize_data_t &iod, int max_size,
+                  const char *purpose = "")
+{
+  // m_size = 0 represents "nothing useful found".
+  m_size = 0;
+  m_insn = iod.insn;
+  m_old_code = UNKNOWN;
+  iod.unused = false;
+
+  // As a side effect, this sets m_set and m_scratch.
+  if (! iod.insn
+      || ! (m_set = single_set_with_scratch (iod.insn, m_scratch)))
+    return false;
+
+  rtx dest = SET_DEST (m_set);
+  machine_mode mode = GET_MODE (dest);
+  const int n_bytes = GET_MODE_SIZE (mode);
+  max_size = std::min (max_size, FUSE_MOVE_MAX_MODESIZE);
+
+  // Only handle destinations that are registers below REG_32 and that
+  // are not too wide.
+  if (! REG_P (dest)
+      || END_REGNO (dest) > REG_32
+      || n_bytes > max_size)
+    return false;
+
+  // Omit insns that (explicitly) touch fixed GPRs in any way.
+  using elt0_getter_HRS = elt0_getter<HARD_REG_SET, HARD_REG_ELT_TYPE>;
+  HARD_REG_SET hregs;
+  CLEAR_HARD_REG_SET (hregs);
+  find_all_hard_regs (PATTERN (iod.insn), & hregs);
+  if (memento_t::fixed_regs_mask & (gprmask_t) elt0_getter_HRS::get (hregs))
+    {
+      avr_dump (";; %sinit1 has fixed GPRs\n", purpose);
+      return false;
+    }
+
+  // Record in IOD whether the result of the insn is unused.  Such insns
+  // are not analyzed any further.
+  if ((iod.unused = find_reg_note (iod.insn, REG_UNUSED, dest)))
+    return false;
+
+  m_src = SET_SRC (m_set);
+  m_regno = REGNO (dest);
+  const rtx_code src_code = GET_CODE (m_src);
+
+  // Get an abstract, byte-wise representation of the source.
+  m_ai = absint_t::explore (m_src, iod.regs, mode);
+
+  if (m_ai.popcount ())
+    {
+      if (m_ai.end_knows (CONST_INT) >= n_bytes)
+       {
+         // All bytes are compile-time constants:  This is a reg = const.
+         // m_old_code records the original code when the source wasn't
+         // a constant to begin with.
+         m_code = CONST_INT;
+         m_old_code = CONSTANT_P (m_src) ? UNKNOWN : src_code;
+         m_isrc = m_ai.get_value (n_bytes);
+         m_size = n_bytes;
+       }
+      else if (! REG_P (m_src)
+              && n_bytes == 1
+              && m_ai.end_knows (REG) >= n_bytes)
+       {
+         // A 1-byte non-move that decays to a reg = reg move.
+         m_code = REG;
+         m_old_code = src_code;
+         m_rsrc = m_ai[0].regno ();
+         m_size = n_bytes;
+       }
+      else if (n_bytes == 1)
+       {
+         // A 1-byte insn for which there may be an alternative binary
+         // operation.  It is only accepted when it has 2 arguments and
+         // the first one is the destination register.
+         absint_byte_t &aib = m_new_src;
+         aib = m_ai[0].find_alternative_binary (iod.regs);
+
+         if (aib.arity () == 2
+             && aib.arg (0).regno == m_regno)
+           {
+             m_old_code = src_code;
+             m_code = aib.get_code ();
+             m_size = n_bytes;
+           }
+       }
+      else if (n_bytes >= 2
+              && m_ai.end_knows (VALUE) >= n_bytes)
+       {
+         // A multi-byte insn for which something is known about each byte.
+         m_code = src_code;
+         m_size = n_bytes;
+       }
+
+      if (dump_file && m_size != 0)
+       {
+         avr_dump (";; %sinit1 (%C", purpose,
+                   m_old_code ? m_old_code : m_code);
+         if (m_old_code)
+           avr_dump ("-> %C", m_code);
+         avr_dump (") insn %d to R%d[%d] := %C:%m = ", INSN_UID (iod.insn),
+                   m_regno, n_bytes, src_code, mode);
+
+         m_ai.dump (dest);
+
+         if (dump_flags & TDF_FOLDING)
+           avr_dump ("\n");
+       }
+    }
+
+  return m_size != 0;
+}
+
+
+// The private worker for .apply_insn().  Update the known register values
+// according to the effect of INSN:  Each register that INSN sets or clobbers
+// becomes unknown, and for some selected single_sets of GPRs, the bytes of
+// the destination that turn out to be compile-time constants are recorded
+// as known.  When UNUSED is true, then only the clobbering is tracked and
+// no new values are recorded.
+void
+memento_t::apply_insn1 (rtx_insn *insn, bool unused)
+{
+  gcc_assert (NONDEBUG_INSN_P (insn));
+
+  if (INSN_CODE (insn) == CODE_FOR_set_some)
+    {
+      // This insn only sets some selected bytes of register $3 of
+      // modesize $4.  If non-0, then $1 is a QImode scratch d-reg with
+      // a known value of $2.
+
+      const auto &xop = recog_data.operand;
+      extract_insn (insn);
+      gcc_assert (recog_data.n_operands == 7);
+      gcc_assert (set_some_operation (xop[0], VOIDmode));
+
+      const rtx &xscratch = xop[1];
+      const rtx &xscratch_value = xop[2];
+      // Index of the first element of the PARALLEL $0 that is
+      // treated as one of the byte sets.
+      const int sets_start = 5;
+
+      for (int i = sets_start; i < XVECLEN (xop[0], 0); ++i)
+       {
+         rtx xset = XVECEXP (xop[0], 0, i);
+         avr_dump (";; set_some %r = %r\n", XEXP (xset, 0), XEXP (xset, 1));
+         set_values (XEXP (xset, 0), XEXP (xset, 1));
+       }
+
+      // The scratch holds its known value again after the insn.
+      if (REG_P (xscratch))
+       {
+         avr_dump (";; set_some %r = %r restore\n", xscratch, xscratch_value);
+         set_values (xscratch, xscratch_value);
+       }
+
+      return;
+    } // CODE_FOR_set_some
+
+  // Snapshot of the state before the clobbers below are applied.  It is
+  // used further down to evaluate the source of the insn.
+  memento_t mold = *this;
+
+  // When insn changes a register in whatever way, set it to "unknown".
+
+  HARD_REG_SET rset;
+  find_all_hard_reg_sets (insn, &rset, true /* implicit */);
+  known &= ~rset;
+
+  rtx set = single_set (insn);
+  rtx dest;
+
+  // Only single_sets of non-fixed registers below REG_32 are simulated.
+  if (! set
+      || ! REG_P (dest = SET_DEST (set))
+      || END_REGNO (dest) > REG_32
+      || (regmask (dest) & memento_t::fixed_regs_mask))
+    return;
+
+  rtx src = SET_SRC (set);
+  const rtx_code src_code = GET_CODE (src);
+  const machine_mode mode = GET_MODE (dest);
+  const int n_bytes = GET_MODE_SIZE (mode);
+
+  // Insns that are too complicated or have a poor yield.
+  // Just record which regs are clobbered / changed.
+  if (n_bytes > FUSE_MOVE_MAX_MODESIZE
+      || MEM_P (src)
+      || (REG_P (src) && END_REGNO (src) > REG_32))
+    {
+      // Comparisons may clobber the compared reg when it is unused after.
+      if (src_code == COMPARE
+         && REG_P (XEXP (src, 0))
+         && CONSTANT_P (XEXP (src, 1)))
+       {
+         rtx reg = XEXP (src, 0);
+         for (unsigned r = REGNO (reg); r < END_REGNO (reg); ++r)
+           set_unknown (r);
+       }
+      return;
+    }
+
+  if (unused)
+    return;
+
+  // Simulate the effect of some selected insns that are likely to produce
+  // or propagate known values.
+
+  // Get an abstract representation of src.  Bytes may be unknown,
+  // known to equal some 8-bit compile-time constant (CTC) value,
+  // or are known to equal some 8-bit register.
+  // TODO: Currently, only the ai[].val8 knowledge is used.
+  //       What's the best way to make use of ai[].regno ?
+
+  absint_t ai = absint_t::explore (src, mold, mode);
+
+  if (ai.popcount ())
+    {
+      avr_dump (";; apply_insn %d R%d[%d] := %C:%m = ", INSN_UID (insn),
+               REGNO (dest), n_bytes, src_code, mode);
+      ai.dump ();
+
+      // Record each byte of the destination that's a known constant.
+      for (int i = 0; i < n_bytes; ++i)
+       if (ai[i].can (CONST_INT))
+         set_value (i + REGNO (dest), ai[i].val8 ());
+    }
+}
+
+
+void
+memento_t::apply (const ply_t &p)
+{
+  if (p.is_movw ())
+    {
+      copy_value (p.regno, p.arg);
+      copy_value (p.regno + 1, p.arg + 1);
+    }
+  else if (p.is_adiw ())
+    {
+      int val = p.arg + values[p.regno] + 256 * values[1 + p.regno];
+      set_value (p.regno, val);
+      set_value (p.regno + 1, val >> 8);
+    }
+  else if (p.size == 1)
+    {
+      int x = values[p.regno];
+      int y = values[p.arg];
+
+      switch (p.code)
+       {
+       default:
+         gcc_unreachable ();
+         break;
+
+       case REG:
+         copy_value (p.regno, p.arg);
+         break;
+
+       case SET:
+         set_value (p.regno, p.arg);
+         if (p.scratch >= REG_16)
+           set_unknown (p.scratch);
+         break;
+
+       case MOD: // BLD
+         gcc_assert (knows (p.regno));
+         if (popcount_hwi (p.arg) == 1)
+           values[p.regno] |= p.arg;
+         else if (popcount_hwi (p.arg) == 7)
+           values[p.regno] &= p.arg;
+         else
+           gcc_unreachable ();
+         break;
+
+#define DO_ARITH(n_args, code, expr)                                   \
+         case code:                                                    \
+           gcc_assert (knows (p.regno));                               \
+           if (n_args == 2)                                            \
+             gcc_assert (knows (p.arg));                               \
+           set_value (p.regno, expr);                                  \
+           break
+
+         DO_ARITH (1, NEG, -x);
+         DO_ARITH (1, NOT, ~x);
+         DO_ARITH (1, PRE_INC, x + 1);
+         DO_ARITH (1, PRE_DEC, x - 1);
+         DO_ARITH (1, ROTATE, (x << 4) | (x >> 4));
+         DO_ARITH (1, ASHIFT, x << 1);
+         DO_ARITH (1, LSHIFTRT, x >> 1);
+         DO_ARITH (1, ASHIFTRT, (x >> 1) | (x & 0x80));
+
+         DO_ARITH (2, AND, x & y);
+         DO_ARITH (2, IOR, x | y);
+         DO_ARITH (2, XOR, x ^ y);
+         DO_ARITH (2, PLUS, x + y);
+         DO_ARITH (2, MINUS, x - y);
+#undef DO_ARITH
+       }
+    } // size == 1
+  else
+    gcc_unreachable ();
+}
+
+
+// Try to find a sequence of ply_t's that represent a II.m_regno = II.m_isrc
+// insn that sets a reg to a compile-time constant, and that is more
+// efficient than just a move insn.  (When try_split_any_p is on, then
+// solutions that perform equal to a move insn are also allowed).
+// MEMO0 is the GPR state before the LEN-th ply runs; for LEN = 1 this is
+// the state before II runs.  A solution has been found
+// when .fpd->solution has at least one entry.  LEN specifies the
+// depth of recursion, which works on the LEN-th ply_t.
+// The search is bounded by .fpd->n_best_plys (maximal number of plys) and
+// by .fpd->max_ply_cost, which is lowered each time a solution is found.
+void
+bbinfo_t::find_plies (int len, const insninfo_t &ii, const memento_t &memo0)
+{
+  if (len > fpd->n_best_plys)
+    return;
+
+  // The GPR state after applying the ply under consideration to MEMO0.
+  memento_t memo = memo0;
+  // Whether MEMO differs from MEMO0 and has to be reset prior to its next use.
+  bool ply_applied_p = false;
+
+  // Extra verbose dumps are only emitted with the TDF_FOLDING dump flag.
+  const bool extra = dump_file && (dump_flags & TDF_FOLDING);
+
+  if (extra)
+    {
+      fprintf (dump_file, ";; #%d (HAM=%d): get_plies R%d[%d] = ", len,
+              ii.hamming (fpd->regs0), ii.m_regno, ii.m_size);
+      fprintf (dump_file, "0x%0*" PRIx64 "\n",
+              2 * ii.m_size, ii.m_isrc & size_to_mask (ii.m_size));
+    }
+
+  // The candidates for the LEN-th ply.
+  plies_t &ps = fpd->plies[len - 1];
+
+  // The previous ply and the one before that, provided they exist.
+  const ply_t *const prev = len >= 2 ? fpd->ply_stack[len - 2] : nullptr;
+  const ply_t *const prev2 = len >= 3 ? fpd->ply_stack[len - 3] : nullptr;
+
+  bbinfo_t::get_plies (ps, ii, memo0, prev);
+
+  // Reject the current candidate for REASON, and proceed with the next one.
+#define NEXT(reason)                                   \
+  do {                                                 \
+    if (extra)                                         \
+      fprintf (dump_file, ";; cont=%s\n", reason);     \
+    goto next;                                         \
+  } while (0)
+
+  for (int ip = 0; ip < ps.n_plies; ++ip)
+    {
+      const ply_t &p = ps.plies[ip];
+
+      fpd->ply_stack[len - 1] = &p;
+
+      // Label "next" is the target of NEXT():  Jumping there runs the
+      // "continue", whereas the ordinary control flow skips it.
+      if (0)
+       next: continue;
+
+      if (extra)
+       ply_t::dump_plys (dump_file, len, 1, fpd->ply_stack + len - 1, memo0);
+
+      // A MOVW with a Hamming distance of < 2 requires more plys.
+      if (p.is_movw () && len + (2 - p.dhamming) > fpd->n_best_plys)
+       NEXT ("movw.plys");
+
+      if (len >= 2)
+       {
+         // Destroying (parts of) the results of the previous ply
+         // won't yield an optimal sequence.
+         if (p.overrides (prev))
+           NEXT ("overrides");
+
+         // When two plys are independent of each other, then only
+         // investigate sequences that operate on the higher reg first.
+         // This canonicalization reduces the number of candidates.
+         if (p.commutes_with (prev, ii.m_scratch)
+             && p.regno > prev->regno)
+           NEXT ("noncanonic");
+
+         // Two subsequent BLDs touching the same register.
+         if (p.is_bld ()
+             && prev->is_bld ()
+             && p.changes_result_of (prev))
+           NEXT ("2bld");
+
+         // When there is a BLD, then at least 2 of the same kind
+         // shall occur in a row.
+         if (prev->is_bld ()
+             && ! p.is_bld ()
+             && (len == 2
+                 || (prev->is_setbld () && ! prev2->is_setbld ())
+                 || (prev->is_cltbld () && ! prev2->is_cltbld ())))
+           NEXT ("1bld");
+       }
+
+      // The hamming delta of a MOVW may be less than 2, namely 0 or 1.
+      // When the latter is the case, then a reasonable sequence must
+      // modify the result of the MOVW.
+      if (len >= 2
+         && prev->is_movw ()
+         && prev->dhamming == 1
+         && ! p.changes_result_of (prev))
+       NEXT ("movw.dh=1");
+
+      if (len >= 3
+         && prev2->is_movw ()
+         && prev2->dhamming == 0
+         && ! p.changes_result_of (prev2))
+       NEXT ("movw.dh=0");
+
+      // When setting an n-byte destination, then at most n/2 MOVWs
+      // will occur in an optimal sequence.
+      int n_movw = 0;
+      for (int i = 0; i < len; ++i)
+       n_movw += fpd->ply_stack[i]->is_movw ();
+      if (n_movw > ii.m_size / 2)
+       NEXT ("movws");
+
+      // MEMO still holds the result of a previous candidate; start over.
+      if (ply_applied_p)
+       memo = memo0;
+
+      memo.apply (p);
+
+      ply_applied_p = true;
+
+      // Calculate the cost of the sequence we have so far.  Scale by some
+      // factor so that we can express that ADIW is more expensive than MOVW
+      // because it is slower, but without defeating MOVW.
+      const int SCALE = 4;
+
+      // PENAL is only used in dump output.
+      int penal = 0;
+      int cost = SCALE * 0;
+
+      bool movw_p = 0;
+      for (int i = 0; i < len; ++i)
+       {
+         bool adiw_p = fpd->ply_stack[i]->is_adiw ();
+         cost += SCALE * fpd->ply_stack[i]->cost + adiw_p;
+         penal += adiw_p;
+         movw_p |= fpd->ply_stack[i]->is_movw ();
+       }
+      penal += movw_p;
+
+      // The Hamming distance that remains after the current ply.
+      const int hamm = ii.hamming (memo);
+
+      // The current Hamming distance yields a lower bound of how many
+      // plys are still required.  Consider that future cost already now.
+      int future_cost = AVR_HAVE_MOVW || (AVR_HAVE_ADIW && ii.m_regno >= REG_22)
+       ? (1 + hamm) / 2
+       : hamm;
+
+      // Similarly, when MOVW doesn't decrease the Hamming distance by 2,
+      // then we know that at least 2 - dhamming plys must follow in the
+      // future.  (MOVW + ADIW will not occur.)
+      if (p.is_movw ())
+       future_cost = std::max (future_cost, 2 - p.dhamming);
+
+      if (extra && future_cost)
+       avr_dump (";; future cost = %d, dh=%d\n", future_cost, hamm);
+
+      cost += SCALE * future_cost;
+
+      // A sequence is worth pursuing when it is cheaper than the best
+      // one so far.  With try_split_any_p, a sequence that costs the
+      // same as the original move insn is also accepted.
+      bool profitable = (cost < SCALE * fpd->max_ply_cost
+                        || (bbinfo_t::try_split_any_p
+                            && cost / SCALE <= fpd->max_ply_cost
+                            && cost / SCALE == fpd->movmode_cost));
+      if (! profitable)
+       {
+         if (extra)
+           avr_dump (";; cont=cost %d+%d/%d\n", cost / SCALE, penal, SCALE);
+         continue;
+       }
+
+      if (hamm)
+       {
+         // Go down that rabbit hole.
+         gcc_assert (ply_applied_p);
+         bbinfo_t::find_plies (1 + len, ii, memo);
+         continue;
+       }
+
+      // Found a solution that's better than everything so far.
+
+      // Reduce the upper cost bound according to the found solution.
+      // No future solution will be more expensive.
+      fpd->max_ply_cost = cost / SCALE;
+
+      fpd->solution = plies_t (len, fpd->ply_stack);
+
+      if (dump_file)
+       {
+         avr_dump (";; #%d FOUND COST = %d%s\n", len, cost / SCALE,
+                   penal ? " with penalty" : "");
+         ply_t::dump_plys (dump_file, 0, len, fpd->ply_stack, fpd->regs0);
+         if (extra)
+           avr_dump (";; END\n");
+       }
+    } // for ply_t's
+
+#undef NEXT
+}
+
+
+// Run .find_plies() and return true when .fpd->solution is a sequence of ply_t's
+// that represents II, a REG = CONST insn.  MEMO is the GPR state prior to II.
+bool
+bbinfo_t::run_find_plies (const insninfo_t &ii, const memento_t &memo) const
+{
+  fpd->solution.reset ();
+  fpd->regs0 = memo;
+  fpd->n_get_plies = 0;
+
+  const int hamm = ii.hamming (memo);
+
+  if (hamm == 0)
+    {
+      avr_dump (";; Found redundant insn %d\n", INSN_UID (ii.m_insn));
+      return true;
+    }
+
+  // Upper bound (in words) for any solution that's better than mov<mode>.
+  // Will be decreased by find plies as it finds better solutions.
+  fpd->movmode_cost = ii.cost ();
+  fpd->max_ply_cost = fpd->movmode_cost;
+
+  // With a non-zero Hamming distance, this insn will require at least one
+  // instruction.  When the upper bound for required instructions is that
+  // small, then the current insn is good enough.
+  if (fpd->max_ply_cost <= 1)
+    return false;
+
+  fpd->n_best_plys = ii.n_best_plys (hamm);
+  gcc_assert (fpd->n_best_plys <= N_BEST_PLYS);
+
+  if (dump_file)
+    {
+      const uint64_t mask = size_to_mask (ii.m_size);
+      fprintf (dump_file, ";; find_plies R%d[%d] = 0x%0*" PRIx64,
+              ii.m_regno, ii.m_size, 2 * ii.m_size, ii.m_isrc & mask);
+      if (ii.m_scratch)
+       fprintf (dump_file, ", scratch=r%d", ii.m_scratch);
+      memo.dump ("\n;; regs%s\n");
+    }
+
+  avr_dump (";; mov<mode> cost = %d\n", fpd->max_ply_cost);
+  avr_dump (";; max plys = %d\n", fpd->n_best_plys);
+  ply_t::n_ply_ts = 0;
+
+  find_plies (1, ii, memo);
+
+  avr_dump (";; get_plies called %d times\n", fpd->n_get_plies);
+  avr_dump (";; n_ply_ts = %d\n", ply_t::n_ply_ts);
+  ply_t::max_n_ply_ts = std::max (ply_t::max_n_ply_ts, ply_t::n_ply_ts);
+
+  return fpd->solution.n_plies != 0;
+}
+
+
+// Try to fuse two 1-byte insns .prev and .curr to one 2-byte insn (MOVW).
+// Returns true on success, and sets .n_new_insns, .ignore_mask etc.
+bool
+optimize_data_t::try_fuse (bbinfo_t *bbi)
+{
+  insninfo_t comb;
+
+  if (! prev.ii.m_size
+      || ! curr.ii.m_size
+      || ! comb.combine (prev.ii, curr.ii))
+    return false;
+
+  avr_dump (";; Working on fuse of insn %d + insn %d = 0x%04x\n",
+           INSN_UID (prev.insn), INSN_UID (curr.insn),
+           (unsigned) comb.m_isrc);
+
+  bool found = bbi->run_find_plies (comb, prev.regs);
+  if (found)
+    {
+      avr_dump (";; Found fuse of insns %d and %d\n",
+               INSN_UID (prev.insn), INSN_UID (curr.insn));
+
+      n_new_insns = bbinfo_t::fpd->solution.emit_insns (comb, prev.regs);
+      delete_prev_p = true;
+
+      if (prev.ii.m_scratch)
+       ignore_mask |= regmask (prev.ii.m_scratch, 1);
+      if (curr.ii.m_scratch)
+       ignore_mask |= regmask (curr.ii.m_scratch, 1);
+      ignore_mask &= ~regmask (comb.m_regno, comb.m_size);
+    }
+
+  return found;
+}
+
+
+// Replace a 1-byte insn .curr that's not a reg-reg move, but that has
+// been found to behave like one, by a plain reg-reg move.
+// Returns true on success, and then sets .n_new_insns.
+bool
+optimize_data_t::try_simplify (bbinfo_t *)
+{
+  const bool decays_to_move_p = (curr.ii.m_size == 1
+                                 && curr.ii.m_old_code != REG
+                                 && curr.ii.m_code == REG);
+  if (! decays_to_move_p)
+    return false;
+
+  avr_dump (";; Found simplify of insn %d\n", INSN_UID (curr.insn));
+
+  n_new_insns = curr.ii.emit_insn ();
+
+  return true;
+}
+
+
+// Try to replace XEXP (*, 1) of a binary operation by a cheaper expression.
+// Returns true on success; sets .n_new_insns, .ignore_mask, .delete_prev_p.
+bool
+optimize_data_t::try_bin_arg1 (bbinfo_t *)
+{
+  if (curr.ii.m_size != 1
+      || curr.ii.m_new_src.arity () != 2
+      || curr.unused)
+    return false;
+
+  avr_dump (";; Working on bin_arg1 insn %d\n", INSN_UID (curr.insn));
+
+  gcc_assert (curr.ii.m_src && BINARY_P (curr.ii.m_src));
+  rtx xarg1_old = XEXP (curr.ii.m_src, 1);
+
+  const absint_byte_t &aib = curr.ii.m_new_src;
+  const absint_val_t &arg0 = aib.arg (0);
+  const absint_val_t &arg1 = aib.arg (1);
+  const absint_val_t &arg1_old = curr.ii.m_ai[0].arg (1);
+
+  rtx src = NULL_RTX;
+
+  if (CONSTANT_P (xarg1_old))
+    {
+      // Sometimes, we allow expensive constants as 2nd operand like
+      // in  R2 += 2  which produces two INCs.  When we have the
+      // constant handy in a reg, then use that instead of the constant.
+      const rtx_code code = aib.get_code ();
+      gcc_assert (arg1.val8 == (INTVAL (xarg1_old) & 0xff));
+
+      if (AVRasm::constant_cost (code, arg0.regno, arg1.val8) > 1)
+         src = aib.to_rtx ();
+    }
+  else if (REG_P (xarg1_old)
+          && dead_or_set_p (curr.insn, xarg1_old))
+    {
+      src = aib.to_rtx ();
+
+      // The 2nd operand is a reg with a known content that dies
+      // at the current insn.  Chances are high that the register
+      // holds a reload value only used by the current insn.
+      if (prev.ii.m_size == 1
+         && rtx_equal_p (xarg1_old, SET_DEST (prev.ii.m_set))
+         && CONSTANT_P (prev.ii.m_src))
+       {
+         avr_dump (";; Found dying reload insn %d\n", INSN_UID (prev.insn));
+
+         delete_prev_p = true;
+         ignore_mask = regmask (arg1_old.regno, 1);
+       }
+    }
+
+  if (src)
+    {
+      rtx dest = SET_DEST (curr.ii.m_set);
+
+      avr_dump (";; Found bin_arg1 for insn %d: ", INSN_UID (curr.insn));
+      avr_dump ("%C:%m %r", curr.ii.m_code, GET_MODE (dest), xarg1_old);
+      aib.dump (" = %s\n");
+
+      emit_valid_move_clobbercc (dest, src);
+      n_new_insns = 1;
+    }
+
+  return src != NULL_RTX;
+}
+
+
+// Try to replace a REG = CONST insn by a cheaper sequence.
+// Returns true on success, and sets .n_new_insns, .ignore_mask etc.
+bool
+optimize_data_t::try_split_ldi (bbinfo_t *bbi)
+{
+  if (! curr.ii.m_size
+      || curr.unused
+      || curr.ii.m_code != CONST_INT
+      || (! bbinfo_t::try_split_any_p
+         // Finding plys will only ever succeed when there are
+         // regs with a known value.
+         && ! (curr.regs.known
+               || (AVR_HAVE_MOVW
+                   && curr.ii.m_regno < REG_16 && curr.ii.m_size == 4))))
+    return false;
+
+  avr_dump (";; Working on split_ldi insn %d\n", INSN_UID (curr.insn));
+
+  bool found = bbi->run_find_plies (curr.ii, curr.regs);
+  if (found)
+    {
+      avr_dump (";; Found split for ldi insn %d\n", INSN_UID (curr.insn));
+
+      n_new_insns = bbinfo_t::fpd->solution.emit_insns (curr.ii, curr.regs);
+
+      if (curr.ii.m_scratch)
+       ignore_mask = regmask (curr.ii.m_scratch, 1);
+    }
+
+  return found;
+}
+
+
+// Helper for try_split_any():  Record the failure by setting .n_new_insns
+// to -1, log REASON to the dump file if there is one, and return false so
+// that callers can just "return fail (...)".
+bool
+optimize_data_t::fail (const char *reason)
+{
+  if (dump_file)
+    fprintf (dump_file, ";; Giving up split_any: %s\n", reason);
+
+  n_new_insns = -1;
+
+  return false;
+}
+
+
+// Helper for try_split_any():  Emit a DEST = SRC insn, count it in
+// .n_new_insns, and apply its effect to MEMO.  Returns the emitted insn.
+rtx_insn *
+optimize_data_t::emit_and_apply_move (memento_t &memo, rtx dest, rtx src)
+{
+  rtx_insn *const new_insn = emit_valid_move_clobbercc (dest, src);
+
+  n_new_insns += 1;
+  memo.apply_insn (new_insn, false);
+
+  return new_insn;
+}
+
+
+// Set X0 and X1 so that they are operands valid for a andqi3, iorqi3, xorqi3
+// or addqi3 insn with destination R_DEST.  The operands are taken from the
+// two arguments of AIB.  The method loads X1 to
+// a scratch reg as needed and records the GPR effect in IOD.regs.
+// CODE is the code of the operation; it may be changed to MINUS.
+// EXTRA_COST are extra costs in units of words of insns that cost more
+// than one instruction.  This is a helper for try_split_any().
+// Returns true on success.  Otherwise, returns false by means of fail().
+bool
+optimize_data_t
+    ::get_2ary_operands (rtx_code &code, const absint_byte_t &aib,
+                        insn_optimize_data_t &iod, int r_dest,
+                        absint_val_t &x0, absint_val_t &x1, int &extra_cost)
+{
+  if (code != IOR && code != AND && code != XOR && code != PLUS)
+    return fail ("2ary: unknown code");
+
+  x0 = aib.arg (0);
+  x1 = aib.arg (1);
+
+  // The 1st operand must be a register, and there must be
+  // some knowledge about the 2nd one.
+  if (! x0.knows_regno ()
+      || x1.clueless ())
+    return fail ("2ary: clueless");
+
+  // A negative val8 means that x1 is not a known constant; use
+  // a prohibitive cost in that case.
+  int val8 = x1.val8;
+  int val8_cost = val8 < 0 ? 100 : AVRasm::constant_cost (code, r_dest, val8);
+
+  // The easy case:  A 2-operand insn where the 2nd operand
+  // is a register or a cheap constant.
+  if (x0.regno == r_dest
+      && (x1.knows_regno ()
+         || val8_cost <= 1))
+    {
+      if (code == XOR
+         && val8 == 0x80
+         && x0.regno >= REG_16)
+       {
+         // xorqi3 can only "r,0,r".
+         // x0 ^ 0x80  <=>  x0 - 0x80.
+         x1.regno = 0;
+         code = MINUS;
+       }
+      return true;
+    }
+
+  const bool and_1_bit = code == AND && popcount_hwi (val8) == 1;
+  // andqi3 has a "r,r,Cb1" alternative where Cb1 has exactly 1 bit set.
+  // This can accommodate bytes of higher AND Cb<N> alternatives.
+  if (x0.regno != r_dest)
+    {
+      if (and_1_bit)
+       {
+         extra_cost += 1 + (r_dest < REG_16);
+         return true;
+       }
+      else if (x1.regno == r_dest)
+       {
+         // All supported codes are commutative, hence just swap the
+         // operands so that the 1st one is the destination.
+         std::swap (x0, x1);
+         return true;
+       }
+      return fail ("2ary is a 3-operand insn");
+    }
+
+  // Now we have:
+  // 1)  r_dest = x0.regno, and
+  // 2)  x1 is val8, and
+  // 3)  x1 costs 2.
+
+  // Whether performing the operation with constant val8 requires a scratch.
+  // NOTE(review): select<> and bad_case<> are defined outside of this
+  // chunk; presumably they set up a chain of conditionals that aborts when
+  // no case matches -- confirm against their definitions.
+  const bool needs_scratch_p = select<bool>()
+    : code == XOR ? true
+    : code == AND ? popcount_hwi (val8) != 7
+    : code == IOR ? popcount_hwi (val8) != 1
+    : code == PLUS ? IN_RANGE (val8, 3, 0xff - 3)
+    : bad_case<bool> ();
+
+  const int r_val8 = iod.regs.regno_with_value (val8, 0 /* excludes: none */);
+  if (r_val8)
+    {
+      // Found a reg that already holds the constant.
+      x1.val8 = -1;
+      x1.regno = r_val8;
+      return true;
+    }
+  else if (iod.ii.m_scratch)
+    {
+      // Using the insn's scratch reg.
+      rtx xdst = gen_rtx_REG (QImode, iod.ii.m_scratch);
+      rtx xsrc = gen_int_mode (x1.val8, QImode);
+      emit_and_apply_move (iod.regs, xdst, xsrc);
+
+      x1.regno = iod.ii.m_scratch;
+      x1.val8 = -1;
+
+      return true;
+    }
+  else if (! needs_scratch_p)
+    {
+      // Some constants (1 and -1) can be loaded without a scratch.
+      extra_cost += 1;
+      return true;
+    }
+  else if (and_1_bit)
+    {
+      // This can always fall back to BST + CLR + BLD, but may be cheaper.
+      extra_cost += 1 + (r_dest < REG_16);
+      return true;
+    }
+
+  return fail ("2ary: expensive constant");
+}
+
+
+// Return true iff CODE is one of the shift codes LSHIFTRT, ASHIFTRT, ASHIFT.
+static inline bool
+any_shift_p (rtx_code code)
+{
+  switch (code)
+    {
+    case LSHIFTRT:
+    case ASHIFTRT:
+    case ASHIFT:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+// Try to split .curr into a sequence of 1-byte insns (and MOVWs where
+// possible).  This is only attempted for multi-byte insns with a
+// non-constant source for which the abstract interpretation .curr.ii.m_ai
+// knows what to do with each byte of the destination.
+// Returns true on success.  Sets .n_new_insns and .ignore_mask.
+// Most failures are reported by means of fail(), which sets .n_new_insns
+// to -1; notice that insns may already have been emitted by then.
+bool
+optimize_data_t::try_split_any (bbinfo_t *)
+{
+  if (curr.ii.m_size < 2
+      // Constants are split by split_ldi.
+      || CONSTANT_P (curr.ii.m_src)
+      // Splitting requires knowledge about what to do with each byte.
+      || curr.ii.m_ai.end_knows (VALUE) < curr.ii.m_size)
+    return false;
+
+  avr_dump (";; Working on split_any %C:%m insn %d\n", curr.ii.m_code,
+           GET_MODE (SET_DEST (curr.ii.m_set)), INSN_UID (curr.insn));
+
+  const insninfo_t &ii = curr.ii;
+  const int n_bytes = ii.m_size;
+  // Costs in words on top of one word per emitted insn.
+  int extra_cost = 0;
+  // The cost of .curr in words, or -1 if there's no cost to compete against.
+  int binop_cost = -1;
+
+  // For plain AND, IOR, XOR get the current cost in units of words.
+  if (BINARY_P (curr.ii.m_src))
+    {
+      const rtx_code code = curr.ii.m_code;
+      if ((code == IOR || code == AND || code == XOR)
+         && REG_P (XEXP (curr.ii.m_src, 0))
+         && CONSTANT_P (XEXP (curr.ii.m_src, 1)))
+       {
+         binop_cost = get_attr_length (curr.insn);
+         avr_dump (";; Competing against %C:%m cost = %d\n", code,
+                   GET_MODE (curr.ii.m_src), binop_cost);
+       }
+    }
+
+  // Step 1: Work out conflicts and which sign extends to perform.
+
+  const gprmask_t regs_dest = regmask (ii.m_regno, n_bytes);
+  // The register that all sign extends originate from, or 0 if none.
+  int r_sign = 0;
+  // The destination registers that are set by a sign extend.
+  gprmask_t regs_signs = 0;
+  // Whether some byte uses a destination reg below resp. above itself.
+  bool has_lsl = false;
+  bool has_lsr = false;
+
+  for (int i = 0; i < n_bytes; ++i)
+    {
+      const absint_byte_t &aib = ii.m_ai[i];
+      const int r_dest = ii.m_regno + i;
+      const gprmask_t regs_src = aib.reg_mask ();
+
+      // When only regs to the right are used, or only regs to the left
+      // are used, then there's no conflict like it is arising for rotates.
+      // For now, only implement conflict-free splits.
+      has_lsl |= has_bits_in (regs_src & regs_dest, 0, r_dest - 1);
+      has_lsr |= has_bits_in (regs_src & regs_dest, r_dest + 1, 31);
+      if (has_lsl && has_lsr)
+       return fail ("has both << and >>");
+
+      if (aib.get_code () == SIGN_EXTEND)
+       {
+         // Only sign extends that all use the same source are supported.
+         const absint_val_t x0 = aib.arg (0);
+         if (! r_sign)
+           r_sign = x0.regno;
+         else if (r_sign != x0.regno)
+           return fail ("too many signs");
+
+         // Signs are handled below after all the other bytes.
+         regs_signs |= regmask (r_dest, 1);
+       }
+    }
+
+  // Step 2: Work on the individual bytes and emit according insns.
+
+  n_new_insns = 0;
+  // Tracks the GPR state across the insns emitted below.
+  memento_t memo = curr.regs;
+
+  // When bytes depend on destination regs below them, then work from
+  // the high byte down to the low byte so that sources are read before
+  // they are overwritten.  Otherwise, work from low to high.
+  const int step = has_lsl ? -1 : 1;
+  const int istart = step == 1 ? 0 : n_bytes - 1;
+  const int iend = step == 1 ? n_bytes : -1;
+
+  for (int i = istart; i != iend; i += step)
+    {
+      const absint_byte_t &aib = ii.m_ai[i];
+      const int r_dest = ii.m_regno + i;
+      rtx_code code = aib.get_code ();
+      rtx xsrc = NULL_RTX;
+      rtx xdest = gen_rtx_REG (QImode, r_dest);
+
+      if (code == SET)
+       {
+         const int r_src = aib.regno (false);
+         const int val8 = aib.val8 (false);
+         int r16;
+
+         // A no-op...
+         if (r_dest == r_src)
+           continue;
+         // ...or an existing 16-bit constant...
+         else if (AVR_HAVE_MOVW
+                  && i + step != iend
+                  // Next is not a no-op.
+                  && ii.m_ai[i + step].regno (false) != r_dest + step
+                  // Eligible for MOVW.
+                  && r_dest + step == (r_dest ^ 1)
+                  && r_dest % 2 == i % 2
+                  && (r16 = ii.m_ai.reg16_with_value (i, i + step, memo)))
+           {
+             // Handle this byte and the next one by one 16-bit move.
+             xdest = gen_rtx_REG (HImode, r_dest & ~1);
+             xsrc = gen_rtx_REG (HImode, r16);
+             i += step;
+           }
+         // ...or a cheap constant...
+         else if (val8 >= 0
+                  && AVRasm::constant_cost (SET, r_dest, val8) <= 1)
+           xsrc = gen_int_mode (val8, QImode);
+         // ...or a reg-reg move...
+         else if (r_src)
+           xsrc = gen_rtx_REG (QImode, r_src);
+         // ...or a costly constant that already exists in some reg...
+         else if (memo.regno_with_value (val8, 0 /* excludes: none */))
+           xsrc = gen_rtx_REG (QImode, memo.regno_with_value (val8, 0));
+         // ...or a costly constant loaded into curr.insn's scratch reg...
+         else if (ii.m_scratch)
+           {
+             rtx xscratch = gen_rtx_REG (QImode, ii.m_scratch);
+             rtx xval8 = gen_int_mode (val8, QImode);
+             emit_and_apply_move (memo, xscratch, xval8);
+             xsrc = xscratch;
+           }
+         // ...or a costly constant (1 or -1) that doesn't need a scratch.
+         else if (! AVRasm::ldi_needs_scratch (r_dest, val8))
+           {
+             extra_cost += 1;
+             xsrc = gen_int_mode (val8, QImode);
+           }
+         else
+           return fail ("expensive val8");
+       } // SET
+      else if (aib.arity () == 1)
+       {
+         if (aib.get_code () == SIGN_EXTEND)
+           // Signs are handled after all the others.
+           continue;
+         else
+           {
+             const absint_val_t x0 = aib.arg (0);
+             rtx xop0 = gen_rtx_REG (QImode, x0.regno);
+             xsrc = gen_rtx_fmt_e (code, QImode, xop0);
+           }
+       } // unary
+      else if (aib.arity () == 2)
+       {
+         absint_val_t x0;
+         absint_val_t x1;
+         // get_2ary_operands() may emit a load of the scratch reg and
+         // records the effect in iod.regs, which is set up from memo.
+         insn_optimize_data_t iod (memo);
+         iod.ii = curr.ii;
+
+         if (! get_2ary_operands (code, aib, iod, r_dest, x0, x1, extra_cost))
+           return false;
+         rtx xop0 = gen_rtx_REG (QImode, x0.regno);
+         rtx xop1 = x1.knows_val8 ()
+           ? gen_int_mode (x1.val8, QImode)
+           : gen_rtx_REG (QImode, x1.regno);
+
+         xsrc = gen_rtx_fmt_ee (code, QImode, xop0, xop1);
+       } // binary
+
+      if (! xsrc)
+       return fail ("no source found");
+
+      // The source of the sign extends is still needed in step 3.
+      if (r_sign
+         && (regmask (xdest) & regmask (r_sign, 1)))
+       return fail ("clobbered r_sign");
+
+      emit_and_apply_move (memo, xdest, xsrc);
+    }
+
+  // Step 3: Emit insns for sign extend.
+  // No more need to track memo beyond this point.
+
+  if (! emit_signs (r_sign, regs_signs))
+    return false;
+
+  // When there is a cost to compete against, then the
+  // new sequence must not be more expensive.
+  if (binop_cost >= 0)
+    {
+      avr_dump (";; Expected cost: %d + %d\n", n_new_insns, extra_cost);
+      if (n_new_insns + extra_cost > binop_cost)
+       return fail ("too expensive");
+    }
+
+  if (ii.m_scratch)
+    ignore_mask = regmask (ii.m_scratch, 1);
+
+  return true;
+}
+
+
+// A helper for try_split_any() above.  Emit insns that copy the sign (bit 7) of R_MSB to all bits of
+// all regs in REGS_SIGNS.  Returns true when done or when REGS_SIGNS is empty, else the result of fail ().
+bool
+optimize_data_t::emit_signs (const int r_msb, gprmask_t regs_signs)
+{
+  if (! regs_signs)  // No sign bytes requested: nothing to emit.
+    return true;
+  else if (! r_msb)
+    return fail ("fatal: no r_msb given");
+
+  // Pick an arbitrary reg from the sign destinations when the source
+  // isn't one of the signs.  Otherwise the sign is computed in place in r_msb.
+  const int r_signs = regs_signs & regmask (r_msb, 1)
+    ? r_msb
+    : ctz_hwi (regs_signs);
+
+  // Set all bits in r_signs according to the sign of r_msb using the r,r,C07 alternative of ashrqi3.
+  // NOTE(review): unlike in emit_move_mask, n_new_insns is not incremented for this insn here -- confirm.
+  rtx xsrc = gen_rtx_fmt_ee (ASHIFTRT, QImode,
+                            gen_rtx_REG (QImode, r_msb), GEN_INT (7));
+  emit_valid_move_clobbercc (gen_rtx_REG (QImode, r_signs), xsrc);
+  regs_signs &= ~regmask (r_signs, 1);  // r_signs is done.
+
+  // Set up a 16-bit sign register if possible, i.e. when the other half of r_signs' pair is a sign, too.
+  int r16_signs = 0;  // 0 means: no 16-bit sign reg available (yet).
+  if (regs_signs & regmask (r_signs ^ 1, 1))
+    {
+      emit_move_mask (r_signs ^ 1, r_signs, 1, regs_signs);
+      r16_signs = r_signs & ~1;  // Even regno of the pair that holds the sign in both bytes.
+    }
+
+  // Handle all even-aligned reg pairs that are entirely signs, provided MOVW is available.
+  if (AVR_HAVE_MOVW)
+    for (int r = FIRST_GPR; r < REG_32; r += 2)
+      {
+       const gprmask_t m = regmask (r, 2);
+       if ((m & regs_signs) == m)  // Both bytes of the pair are still to be done.
+         {
+           if (r16_signs)
+             emit_move_mask (r, r16_signs, 2, regs_signs);
+           else
+             {
+               // No 16-bit source yet: fill this pair with two 8-bit moves and use it as source from now on.
+               emit_move_mask (r + 0, r_signs, 1, regs_signs);
+               emit_move_mask (r + 1, r_signs, 1, regs_signs);
+               r16_signs = r;
+             }
+         }
+      }
+
+  // Handle all remaining signs with 8-bit moves; emit_move_mask clears the handled bit in regs_signs.
+  while (regs_signs)
+    emit_move_mask (ctz_hwi (regs_signs), r_signs, 1, regs_signs);
+
+  return true;
+}
+
+// Helper for the method above.  Move N_BYTES registers from R_SRC to R_DST,
+// keeping track of which regs are still to be done in MASK.
+void
+optimize_data_t::emit_move_mask (int r_dst, int r_src, int n_bytes,
+                                gprmask_t &mask)
+{
+  const gprmask_t mask_dst = regmask (r_dst, n_bytes);
+  const gprmask_t mask_src = regmask (r_src, n_bytes);
+  gcc_assert ((mask_dst & mask) == mask_dst);
+  gcc_assert ((mask_src & mask) == 0);
+  rtx xdst = gen_rtx_REG (size_to_mode (n_bytes), r_dst);
+  rtx xsrc = gen_rtx_REG (size_to_mode (n_bytes), r_src);
+  emit_valid_move_clobbercc (xdst, xsrc);
+  n_new_insns += 1;
+  mask &= ~mask_dst;
+}
+
+// Try the enabled optimizations on each non-debug insn of BB in turn.  CHANGED is set to true when insns were replaced.
+void
+bbinfo_t::optimize_one_block (bool &changed)
+{
+  memento_t prev_regs;  // Copy of REGS taken before the current insn is applied, see below.
+
+  rtx_insn *insn = next_nondebug_insn_bb (bb, BB_HEAD (bb));
+
+  for (rtx_insn *next_insn; insn; insn = next_insn)
+    {
+      next_insn = next_nondebug_insn_bb (bb, insn);  // Fetched before INSN is possibly replaced below.
+
+      avr_dump ("\n;; Working on insn %d\n%r\n\n", INSN_UID (insn), insn);
+
+      optimize_data_t od (prev_regs, regs);
+
+      od.prev.insn = prev_nondebug_insn_bb (bb, insn);
+      od.curr.insn = insn;
+
+      od.prev.ii.init1 (od.prev, 1, "IIprev ");
+      od.curr.ii.init1 (od.curr, 8, "IIcurr ");
+
+      start_sequence ();  // Insns emitted by the try_* methods are picked up as new_insns below.
+
+      // The first enabled optimization that succeeds wins; due to || the remaining ones are not tried.
+      bool found = ((bbinfo_t::try_fuse_p && od.try_fuse (this))
+                   || (bbinfo_t::try_bin_arg1_p && od.try_bin_arg1 (this))
+                   || (bbinfo_t::try_simplify_p && od.try_simplify (this))
+                   || (bbinfo_t::try_split_ldi_p && od.try_split_ldi (this))
+                   || (bbinfo_t::try_split_any_p && od.try_split_any (this)));
+
+      rtx_insn *new_insns = get_insns ();
+      end_sequence ();
+
+      gcc_assert (found == (od.n_new_insns >= 0));  // n_new_insns is negative iff nothing was found.
+
+      ++tick;
+
+      // This insn will become the previous one in the next loop iteration.
+      // Just used in dumps.
+      rtx_insn *new_curr_insn;
+
+      if (! found)
+       {
+         // Nothing changed: keep INSN and just track its effect on the register state.
+         avr_dump (";; Keeping old route.\n");
+         gcc_assert (! od.delete_prev_p);
+
+         prev_regs = regs;
+         regs.apply_insn (insn, false);
+
+         new_curr_insn = insn;
+       }
+      else
+       {
+         // We have new_insns.
+         changed = true;
+
+         if (dump_file)
+           {
+             avr_dump ("\n;; EMIT %d new insn%s replacing ",
+                       od.n_new_insns, "s" + (od.n_new_insns == 1));  // "s" + 1 is "", i.e. no plural-s for 1 insn.
+             if (od.delete_prev_p)
+               avr_dump ("insn %d and ", INSN_UID (od.prev.insn));
+             avr_dump ("insn %d, delete_prev=%d:\n%L\n", INSN_UID (insn),
+                       od.delete_prev_p, new_insns);
+           }
+
+         new_curr_insn = od.emit_sequence (bb, new_insns);
+       } // found
+
+      if (dump_file && new_curr_insn)  // Dump-only: report how the tracked register state changed.
+       {
+         avr_dump ("\n");
+
+         const int d = regs.distance_to (prev_regs);
+         if (d || new_curr_insn != insn)
+           avr_dump (";; %d regs changed state:\n", d);
+
+         if (new_curr_insn != insn)
+           {
+             avr_dump (";; Befor insn %d", INSN_UID (new_curr_insn));
+             prev_regs.dump ();
+           }
+
+         avr_dump (";; After insn %d", INSN_UID (new_curr_insn));
+         regs.dump ();
+       }
+    } // for BB insns
+}
+
+// Run the pass on FUNC: decode -mfuse-move= into the bbinfo_t flags, then optimize each basic block, see below.
+void
+bbinfo_t::optimize_one_function (function *func)
+{
+  bbinfo_t::fpd = XNEW (bbinfo_t::find_plies_data_t);  // Released at the end of this function.
+  bbinfo_t::bb_info = XCNEWVEC (bbinfo_t, last_basic_block_for_fn (func));  // Indexed by bb->index below.
+  int *post_order = XNEWVEC (int, n_basic_blocks_for_fn (func));
+
+  plies_t::max_n_plies = 0;
+
+  using elt0_getter_HRS = elt0_getter<HARD_REG_SET, HARD_REG_ELT_TYPE>;
+  memento_t::fixed_regs_mask = (gprmask_t) elt0_getter_HRS::get (fixed_reg_set);
+
+  // Option -mfuse-move=<0,23> provides a 3:2:2:2 mixed radix value:
+  // -mfuse-move= 0 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 20 1 2 3  Digit
+  // fuse           1   1   1   1   1    1   1   1   1   1    1   1      0
+  // bin_arg1         1 1     1 1      1 1     1 1     1 1      1 1      1
+  // split_any            1 1 1 1          1 1 1 1          1 1 1 1      2
+  // split_ldi                    1 1  1 1 1 1 1 1 1 1 1 1  1 1 1 1      3
+  // use arith                                     1 1 1 1  1 1 1 1      3
+
+  // Which optimization(s) to perform.
+  bbinfo_t::try_fuse_p = avr_fuse_move & 0x1;      // Digit 0 in [0, 1].
+  bbinfo_t::try_bin_arg1_p = avr_fuse_move & 0x2;  // Digit 1 in [0, 1].
+  bbinfo_t::try_split_any_p = avr_fuse_move & 0x4; // Digit 2 in [0, 1].
+  bbinfo_t::try_split_ldi_p = avr_fuse_move >> 3;       // Digit 3 in [0, 2].
+  bbinfo_t::use_arith_p = (avr_fuse_move >> 3) >= 2;    // Digit 3 in [0, 2].
+  bbinfo_t::use_set_some_p = bbinfo_t::try_split_ldi_p; // Digit 3 in [0, 2].
+  bbinfo_t::try_simplify_p = avr_fuse_move != 0;  // On for every non-zero option value.
+
+  // Topologically sort BBs from last to first.
+
+  const int n_post_order = post_order_compute (post_order, false, false);
+  bool changed = false;  // Set by optimize_one_block when it replaced insns.
+
+  // Traverse the BBs from first to last in order to increase the chance
+  // that register values from all incoming edges are known.
+
+  for (int n = n_post_order - 1; n >= 0; --n)
+    {
+      basic_block bb = BASIC_BLOCK_FOR_FN (func, post_order[n]);
+
+      bbinfo_t::bb_info[bb->index].bb = bb;
+      bbinfo_t::bb_info[bb->index].enter ();
+      bbinfo_t::bb_info[bb->index].optimize_one_block (changed);
+      bbinfo_t::bb_info[bb->index].leave ();
+    }
+
+  if (plies_t::max_n_plies)
+    avr_dump (";; max_n_plies=%d\n", (int) plies_t::max_n_plies);
+
+  if (changed)  // Insns were replaced.  NOTE(review): presumably this refreshes DF info and REG notes -- confirm.
+    {
+      df_note_add_problem ();
+      df_analyze ();
+    }
+
+  XDELETEVEC (post_order);
+  XDELETEVEC (bbinfo_t::bb_info);
+  XDELETE (bbinfo_t::fpd);
+}
+
+} // anonymous namespace
+
  
  namespace
  {
@@ -1938,3 +5116,11 @@ make_avr_pass_recompute_notes (gcc::context *ctxt)
  {
    return new avr_pass_recompute_notes (ctxt, "avr-notes-free-cfg");
  }
+
+// Create the pass "avr-fuse-move" that optimizes moves after reload.
+
+rtl_opt_pass *
+make_avr_pass_fuse_move (gcc::context *ctxt)
+{
+  rtl_opt_pass *const pass = new avr_pass_fuse_move (ctxt, "avr-fuse-move");
+  return pass;
+}
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def

index d39bdd89a11b00b8e728071b07593c5d51f65b88..857e6b521238b92948b89e5b010d5bf1167b323c 100644 (file)
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -93,3 +93,14 @@ INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
     Hence, run a mini pass right before split2 which introduces REG_CC.  */
  
  INSERT_PASS_BEFORE (pass_split_after_reload, 1, avr_pass_ifelse);
+
+/* A post reload pass that tracks known values held in registers
+   and performs optimizations based on that knowledge.
+   It also splits non-memory insns that can be represented in
+   terms of byte operations.
+
+   It runs between the two instances of the RTL peephole pass because
+   -  The RTL peepholer may provide a scratch reg for *reload_in<mode>.
+   -  The RTL peepholer may optimize insns involving lower registers.  */
+
+INSERT_PASS_AFTER (pass_peephole2, 1, avr_pass_fuse_move);
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h

index 96708eb4db5859aa7b028ba7cff8334c15caaf37..ae3cc62d038dd65e0b168ad11c08293a9b093a03 100644 (file)
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -46,9 +46,17 @@ extern void avr_init_cumulative_args (CUMULATIVE_ARGS*, tree, rtx, tree);
  #endif /* TREE_CODE */
  
  #ifdef RTX_CODE
+extern rtx avr_chunk (machine_mode mode, rtx x, int n);
+extern rtx avr_byte (rtx x, int n);
+extern rtx avr_word (rtx x, int n);
+extern int8_t avr_int8 (rtx x, int n);
+extern uint8_t avr_uint8 (rtx x, int n);
+extern int16_t avr_int16 (rtx x, int n);
+extern uint16_t avr_uint16 (rtx x, int n);
  extern const char *output_movqi (rtx_insn *insn, rtx operands[], int *l);
  extern const char *output_movhi (rtx_insn *insn, rtx operands[], int *l);
  extern const char *output_movsisf (rtx_insn *insn, rtx operands[], int *l);
+extern const char *avr_out_set_some (rtx_insn *, rtx*, int*);
  extern const char *avr_out_tstsi (rtx_insn *, rtx*, int*);
  extern const char *avr_out_tsthi (rtx_insn *, rtx*, int*);
  extern const char *avr_out_tstpsi (rtx_insn *, rtx*, int*);
@@ -67,6 +75,7 @@ extern const char *avr_out_op8_set_ZN (rtx_code, rtx*, int*);
  extern int avr_len_op8_set_ZN (rtx_code, rtx*);
  extern bool avr_op8_ZN_operator (rtx);
  extern const char *avr_out_cmp_ext (rtx*, rtx_code, int*);
+extern bool avr_set_some_operation (rtx);
  
  extern const char *ashlqi3_out (rtx_insn *insn, rtx operands[], int *len);
  extern const char *ashlhi3_out (rtx_insn *insn, rtx operands[], int *len);
@@ -111,6 +120,7 @@ extern bool avr_has_nibble_0xf (rtx);
  
  extern int extra_constraint_Q (rtx x);
  extern int avr_adjust_insn_length (rtx_insn *insn, int len);
+extern void output_reload_in_const (rtx *, rtx clobber, int *len, bool clear_p);
  extern const char* output_reload_inhi (rtx*, rtx, int*);
  extern const char* output_reload_insisf (rtx*, rtx, int*);
  extern const char* avr_out_reload_inpsi (rtx*, rtx, int*);
@@ -173,6 +183,7 @@ namespace gcc { class context; }
  class rtl_opt_pass;
  
  extern rtl_opt_pass *make_avr_pass_fuse_add (gcc::context *);
+extern rtl_opt_pass *make_avr_pass_fuse_move (gcc::context *);
  extern rtl_opt_pass *make_avr_pass_pre_proep (gcc::context *);
  extern rtl_opt_pass *make_avr_pass_recompute_notes (gcc::context *);
  extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
@@ -184,7 +195,15 @@ extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
  
  /* From avr-log.cc */
  
+#ifdef GCC_DUMPFILE_H
+#define avr_dump(...)                                                  \
+  do {                                                                 \
+    if (dump_file)                                                     \
+      avr_vdump (dump_file, __FUNCTION__, __VA_ARGS__);                        \
+  } while (0)
+#else
  #define avr_dump(...) avr_vdump (NULL, __FUNCTION__, __VA_ARGS__)
+#endif /* GCC_DUMPFILE_H */
  #define avr_edump(...) avr_vdump (stderr, __FUNCTION__, __VA_ARGS__)
  #define avr_fdump(FIL, ...) avr_vdump (FIL, __FUNCTION__, __VA_ARGS__)
  
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc

index d0e39cd15dffa8daf80a6c7c9dc29e24ca7d342c..8fab896b70f660e25e52ed1e988117b1dc78afa8 100644 (file)
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -251,7 +251,7 @@ avr_tolower (char *lo, const char *up)
     byte at which the chunk starts.  N must be an integral multiple
     of the mode size.  */
  
-static rtx
+rtx
  avr_chunk (machine_mode mode, rtx x, int n)
  {
    gcc_assert (n % GET_MODE_SIZE (mode) == 0);
@@ -262,7 +262,7 @@ avr_chunk (machine_mode mode, rtx x, int n)
  
  /* Return the N-th byte of X as an rtx.  */
  
-static rtx
+rtx
  avr_byte (rtx x, int n)
  {
    return avr_chunk (QImode, x, n);
@@ -271,7 +271,7 @@ avr_byte (rtx x, int n)
  
  /* Return the sub-word of X starting at byte number N.  */
  
-static rtx
+rtx
  avr_word (rtx x, int n)
  {
    return avr_chunk (HImode, x, n);
@@ -280,7 +280,7 @@ avr_word (rtx x, int n)
  
  /* Return the N-th byte of compile-time constant X as an int8_t.  */
  
-static int8_t
+int8_t
  avr_int8 (rtx x, int n)
  {
    gcc_assert (CONST_INT_P (x) || CONST_FIXED_P (x) || CONST_DOUBLE_P (x));
@@ -290,7 +290,7 @@ avr_int8 (rtx x, int n)
  
  /* Return the N-th byte of compile-time constant X as an uint8_t.  */
  
-static uint8_t
+uint8_t
  avr_uint8 (rtx x, int n)
  {
    return (uint8_t) avr_int8 (x, n);
@@ -300,7 +300,7 @@ avr_uint8 (rtx x, int n)
  /* Return the sub-word of compile-time constant X that starts
     at byte N as an int16_t.  */
  
-static int16_t
+int16_t
  avr_int16 (rtx x, int n)
  {
    gcc_assert (CONST_INT_P (x) || CONST_FIXED_P (x) || CONST_DOUBLE_P (x));
@@ -311,7 +311,7 @@ avr_int16 (rtx x, int n)
  /* Return the sub-word of compile-time constant X that starts
     at byte N as an uint16_t.  */
  
-static uint16_t
+uint16_t
  avr_uint16 (rtx x, int n)
  {
    return (uint16_t) avr_int16 (x, n);
@@ -519,7 +519,7 @@ avr_option_override (void)
      opt_pass *extra_peephole2
        = g->get_passes ()->get_pass_peephole2 ()->clone ();
      register_pass_info peep2_2_info
-      = { extra_peephole2, "peephole2", 1, PASS_POS_INSERT_AFTER };
+      = { extra_peephole2, "avr-fuse-move", 1, PASS_POS_INSERT_AFTER };
  
      register_pass (&peep2_2_info);
    }
@@ -3674,7 +3674,7 @@ avr_out_xload (rtx_insn * /*insn*/, rtx *op, int *plen)
     Load ld register with any value        : NONE
     Anything else:                         : CLOBBER  */
  
-static void
+void
  output_reload_in_const (rtx *op, rtx clobber_reg, int *len, bool clear_p)
  {
    rtx src = op[1];
@@ -5823,6 +5823,155 @@ out_movhi_mr_r (rtx_insn *insn, rtx op[], int *plen)
  }
  
  
+/* Output code for the "set_some" insn that sets some QImode GPRs.
+   $0 is a parallel; for its layout see the description of the next function.
+   $0[5] ... $0[8] are SETs of QImode registers to const_int values.  All
+      of them are bytes in the register described by $3 and $4.  SET $0[5]
+      is mandatory, but all the following ones are optional.
+   $1 is a QImode scratch d-register or const0_rtx.
+   $2 is the known 8-bit value held in $1 before the insn starts.  When the
+      code below clobbers $1, then it must restore $1 to $2 at the end.
+   $3 The register number of a GPR.
+   $4 The modesize of $3 in 1...4.
+   PLEN == 0:  Output instructions.
+   PLEN != 0:  Set *PLEN to the length of the sequence in words.  */
+
+const char *
+avr_out_set_some (rtx_insn *insn, rtx *xop, int *plen)
+{
+  const int vlen = XVECLEN (xop[0], 0);
+  const int sets_start = 5;
+  gcc_assert (vlen > sets_start);
+
+  if (plen)
+    *plen = 0;
+
+  rtx op[4];  // Operands for the asm templates below: %0 = dest, %1 = src, %2 = scratch, %3 = oldval.
+  rtx &dest = op[0], &src = op[1], &scratch = op[2], &oldval = op[3];
+  scratch = REG_P (xop[1]) ? xop[1] : NULL_RTX;
+  oldval = NULL_RTX;
+
+  /* There are 3 ways to get a scratch, starting with the most preferred one:
+     1) A reg from avr_find_unused_d_reg() need not be restored, and it takes
+       care of fixed regs.  This is an unlikely case, e.g. with -fno-peephole2.
+     2) "set_some" provides a scratch register with a known content.
+       This scratch need not be saved but has to be restored to its value.
+     3) A last resort approach saves and restores some upper register.
+     Notice that "set_some" will only be emitted when avr-fuse-move is fed
+     with mov insn(s) that don't have a scratch reg but need one;
+     hence "set_some" won't have a scratch reg at its disposal, either.  */
+
+  bool knows_way_p = false;  // Whether the way to get a scratch has been decided; done at most once.
+
+  for (int i = sets_start; i < vlen; ++i)
+    {
+      rtx xset = XVECEXP (xop[0], 0, i);
+
+      gcc_assert (GET_CODE (xset) == SET
+                 && REG_P (dest = XEXP (xset, 0))
+                 && CONST_INT_P (src = XEXP (xset, 1)));
+
+      if (src == const0_rtx)  // 0, 1 and -1 are loaded without using a scratch.
+       avr_asm_len ("clr %0", op, plen, 1);
+      else if (src == const1_rtx)
+       avr_asm_len ("clr %0" CR_TAB
+                    "inc %0", op, plen, 2);
+      else if (src == constm1_rtx)
+       avr_asm_len ("clr %0" CR_TAB
+                    "dec %0", op, plen, 2);
+      else
+       {
+         if (! knows_way_p)
+           {
+             knows_way_p = true;
+
+             static const machine_mode size_to_mode[4 + 1] =
+               {
+                 VOIDmode, QImode, HImode, PSImode, SImode
+               };
+
+             const int ex_regno = INTVAL (xop[3]);
+             const int ex_modesize = INTVAL (xop[4]);
+             rtx exclude = gen_rtx_REG (size_to_mode[ex_modesize], ex_regno);
+             rtx dreg = avr_find_unused_d_reg (insn, exclude);
+
+             if (dreg)
+               {
+                 // Way 1
+                 scratch = dreg;
+                 oldval = NULL_RTX;
+               }
+             else if (scratch)
+               {
+                 // Way 2: Restore the known value $2 at the end, but only if the scratch is still used.
+                 if (! reg_unused_after (insn, scratch))
+                   oldval = xop[2];
+               }
+             else
+               {
+                 // Way 3: Save R24 in tmp_reg now and restore it from there at the end.
+                 scratch = all_regs_rtx[REG_24];
+                 oldval = tmp_reg_rtx;
+                 avr_asm_len ("mov %3,%2", op, plen, 1);
+               }
+           } // decide about way
+
+         avr_asm_len ("ldi %2,%1" CR_TAB
+                      "mov %0,%2", op, plen, 2);
+       } // needs a scratch
+    } // for $0[5] ... $0[8].
+
+  if (oldval)  // Restore the scratch: from a reg (way 3) or by reloading its known constant (way 2).
+    avr_asm_len (REG_P (oldval)
+                ? "mov %2,%3"
+                : "ldi %2,%3", op, plen, 1);
+  return "";
+}
+
+
+/* Implements the `set_some_operation' predicate.
+   PARA is a parallel with the following elements:
+   [0] is a USE of an 8-bit scratch d-register or const0_rtx.
+   [1] is the known value held in [0].  When [0] is used as a scratch,
+       then its value has to be restored to [1] after the respective insn.
+   [2] is the regno of a GPR, and
+   [3] is the mode size of that GPR.  All SETs [5]... of PARA will set
+       bytes of that GPR, but in many cases not all of them.
+   [4]  In a clobber of REG_CC.
+   [5] [6] [7] [8]  SETs of an 8-bit register to a const_int value, where
+       all destinations are sub-bytes of [2].  Element [5] is mandatory,
+       and the following elements are optional.  */
+
+bool
+avr_set_some_operation (rtx para)
+{
+  const int sets_start = 5;
+  const int n_sets = XVECLEN (para, 0) - sets_start;
+
+  if (! IN_RANGE (n_sets, 1, 4))
+    return false;
+
+  if (GET_CODE (XVECEXP (para, 0, 4)) != CLOBBER
+      || GET_CODE (XVECEXP (para, 0, 0)) != USE
+      || GET_CODE (XVECEXP (para, 0, 1)) != USE
+      || GET_CODE (XVECEXP (para, 0, 2)) != USE
+      || GET_CODE (XVECEXP (para, 0, 3)) != USE)
+    return false;
+
+  for (int i = sets_start; i < XVECLEN (para, 0); ++i)
+    {
+      rtx xset = XVECEXP (para, 0, i);
+      if (! xset
+         || GET_CODE (xset) != SET
+         || ! register_operand (XEXP (xset, 0), QImode)
+         || ! const_int_operand (XEXP (xset, 1), QImode))
+       return false;
+    }
+
+  return true;
+}
+
+
  /* Implement `TARGET_FRAME_POINTER_REQUIRED'.  */
  /* Return 1 if frame pointer for current function required.  */
  
@@ -6652,9 +6801,9 @@ ashlqi3_out (rtx_insn *insn, rtx operands[], int *len)
  
         case 7:
           *len = 3;
-         return ("ror %0" CR_TAB
-                 "clr %0" CR_TAB
-                 "ror %0");
+         return ("bst %1,0" CR_TAB
+                 "clr %0"   CR_TAB
+                 "bld %0,7");
         }
      }
    else if (CONSTANT_P (operands[2]))
@@ -7077,74 +7226,48 @@ ashlsi3_out (rtx_insn *insn, rtx operands[], int *len)
    return "";
  }
  
-/* 8bit arithmetic shift right  ((signed char)x >> i) */
+/* 8-bit arithmetic shift right  (int8_t) x >> i.  */
  
  const char *
-ashrqi3_out (rtx_insn *insn, rtx operands[], int *len)
+ashrqi3_out (rtx_insn *insn, rtx operands[], int *plen)
  {
    if (CONST_INT_P (operands[2]))
      {
-      int k;
+      if (plen)
+       *plen = 0;
  
-      if (!len)
-       len = &k;
+      const int offs = INTVAL (operands[2]);
  
-      switch (INTVAL (operands[2]))
+      if (IN_RANGE (offs, 0, 5))
         {
-       case 1:
-         *len = 1;
-         return "asr %0";
-
-       case 2:
-         *len = 2;
-         return ("asr %0" CR_TAB
-                 "asr %0");
-
-       case 3:
-         *len = 3;
-         return ("asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0");
-
-       case 4:
-         *len = 4;
-         return ("asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0");
-
-       case 5:
-         *len = 5;
-         return ("asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0" CR_TAB
-                 "asr %0");
-
-       case 6:
-         *len = 4;
-         return ("bst %0,6"  CR_TAB
-                 "lsl %0"    CR_TAB
-                 "sbc %0,%0" CR_TAB
-                 "bld %0,0");
-
-       default:
-         if (INTVAL (operands[2]) < 8)
-           break;
-
-         /* fall through */
-
-       case 7:
-         *len = 2;
-         return ("lsl %0" CR_TAB
-                 "sbc %0,%0");
+         for (int i = 0; i < offs; ++i)
+           avr_asm_len ("asr %0", operands, plen, 1);
+         return "";
+       }
+      else if (offs == 6)
+       {
+         return avr_asm_len ("bst %0,6"  CR_TAB
+                             "lsl %0"    CR_TAB
+                             "sbc %0,%0" CR_TAB
+                             "bld %0,0", operands, plen, 4);
+       }
+      else if (offs >= 7)
+       {
+         rtx xop[2] = { operands[0], operands[1] };
+         if (! reg_unused_after (insn, xop[1]))
+           {
+             avr_asm_len ("mov %0,%1", xop, plen, 1);
+             xop[1] = xop[0];
+           }
+         return avr_asm_len ("lsl %1" CR_TAB
+                             "sbc %0,%0", xop, plen, 2);
         }
      }
    else if (CONSTANT_P (operands[2]))
      fatal_insn ("internal compiler error.  Incorrect shift:", insn);
  
    out_shift_with_cnt ("asr %0",
-                     insn, operands, len, 1);
+                     insn, operands, plen, 1);
    return "";
  }
  
@@ -9359,7 +9482,7 @@ avr_out_insert_notbit (rtx_insn *insn, rtx op[], int *plen)
     -  <Shift> is any of: ASHIFT, LSHIFTRT, ASHIFTRT.
     -  The result depends on XOP[1].
     or  XOP[0] = XOP[1] & XOP[2]  where
-   -  XOP[0] and XOP[1] have the same mode which is one of: HI, PSI, SI.
+   -  XOP[0] and XOP[1] have the same mode which is one of: QI, HI, PSI, SI.
     -  XOP[2] is an exact const_int power of 2.
     Returns "".
     PLEN != 0: Set *PLEN to the code length in words.  Don't output anything.
@@ -10449,6 +10572,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len)
      case ADJUST_LEN_EXTR_NOT: avr_out_extr_not (insn, op, &len); break;
      case ADJUST_LEN_EXTR: avr_out_extr (insn, op, &len); break;
      case ADJUST_LEN_INSV: avr_out_insv (insn, op, &len); break;
+    case ADJUST_LEN_SET_SOME: avr_out_set_some (insn, op, &len); break;
  
      case ADJUST_LEN_PLUS: avr_out_plus (insn, op, &len); break;
      case ADJUST_LEN_PLUS_EXT: avr_out_plus_ext (insn, op, &len); break;
@@ -12080,8 +12204,6 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code,
               *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1,
                                               speed);
             }
-         else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63))
-           *total = COSTS_N_INSNS (1);
           else
             *total = COSTS_N_INSNS (2);
           break;
@@ -12093,8 +12215,6 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code,
               *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1,
                                               speed);
             }
-         else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63))
-           *total = COSTS_N_INSNS (2);
           else
             *total = COSTS_N_INSNS (3);
           break;
@@ -12106,8 +12226,6 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code,
               *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1,
                                               speed);
             }
-         else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63))
-           *total = COSTS_N_INSNS (1);
           else
             *total = COSTS_N_INSNS (4);
           break;
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md

index bf5115020ab75930538af9721663c055b0081744..d0d78ba818a01e15078ad0d7e0cb54ac1c726170 100644 (file)
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -169,7 +169,7 @@
     ashlhi, ashrhi, lshrhi,
     ashlsi, ashrsi, lshrsi,
     ashlpsi, ashrpsi, lshrpsi,
-   insert_bits, insv_notbit, insv,
+   insert_bits, insv_notbit, insv, set_some,
     add_set_ZN, add_set_N, cmp_uext, cmp_sext, cmp_lsr,
     no"
    (const_string "no"))
@@ -1128,6 +1128,27 @@
    [(set_attr "length" "4,4,8,9,4,10")
     (set_attr "adjust_len" "mov32")])
  
+;; Setting just some bytes of a register when some of them are already known.
+;; This is only needed for the lower regs, and when there is no scratch reg.
+(define_insn "set_some"
+  [(match_parallel 0 "set_some_operation"
+    [(use (match_operand:QI 1 "dreg_or_0_operand" "d Y00")) ; known d-reg or 0
+     (use (match_operand:QI 2 "const_int_operand" "n"))     ; known value of $1
+     (use (match_operand:QI 3 "const_int_operand" "n"))     ; regno of the GPR whose bytes are set
+     (use (match_operand:QI 4 "const_int_operand" "n"))     ; mode size of $3
+     (clobber (reg:CC REG_CC))
+     ;; 1...4 of these SETs; only the first one is mandatory.
+     (set (match_operand:QI 5 "register_operand" "=r")
+          (match_operand:QI 6 "const_int_operand" "n"))])]
+  "reload_completed"
+  {
+    return avr_out_set_some (insn, operands, nullptr);
+  }
+  [(set (attr "length")
+        (symbol_ref "2 + 2 * (XVECLEN (operands[0], 0) - 5)")) ; Upper bound: 2 words per SET plus 2 to save / restore a scratch.
+   (set_attr "adjust_len" "set_some")])
+
+
  ;; fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
  ;; move floating point numbers (32 bit)
  
@@ -4258,9 +4279,9 @@
  ; and
  
  (define_insn_and_split "andqi3"
-  [(set (match_operand:QI 0 "register_operand"       "=??r,d,*l")
-        (and:QI (match_operand:QI 1 "register_operand" "%0,0,0")
-                (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1")))]
+  [(set (match_operand:QI 0 "register_operand"       "=??r,d,*l ,r")
+        (and:QI (match_operand:QI 1 "register_operand" "%0,0,0  ,r")
+                (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1,Cb1")))]
    ""
    "#"
    "&& reload_completed"
@@ -4270,16 +4291,18 @@
                (clobber (reg:CC REG_CC))])])
  
  (define_insn "*andqi3"
-  [(set (match_operand:QI 0 "register_operand"       "=??r,d,*l")
-        (and:QI (match_operand:QI 1 "register_operand" "%0,0,0")
-                (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1")))
+  [(set (match_operand:QI 0 "register_operand"       "=??r,d,*l ,r")
+        (and:QI (match_operand:QI 1 "register_operand" "%0,0,0  ,r")
+                (match_operand:QI 2 "nonmemory_operand" "r,i,Ca1,Cb1")))
     (clobber (reg:CC REG_CC))]
    "reload_completed"
    "@
         and %0,%2
         andi %0,lo8(%2)
-       * return avr_out_bitop (insn, operands, NULL);"
-  [(set_attr "length" "1,1,2")])
+       * return avr_out_bitop (insn, operands, NULL);
+       * return avr_out_insv (insn, operands, NULL);"
+  [(set_attr "length" "1,1,2,3")
+   (set_attr "adjust_len" "*,*,out_bitop,insv")])
  
  (define_insn_and_split "andhi3"
    [(set (match_operand:HI 0 "register_operand"       "=??r,d,d,r  ,r  ,r")
@@ -4591,8 +4614,8 @@
    [(parallel [(set (match_dup 0)
                     (xor:PSI (match_dup 1)
                              (match_dup 2)))
-                   (clobber (match_dup 3))
-                   (clobber (reg:CC REG_CC))])])
+              (clobber (match_dup 3))
+              (clobber (reg:CC REG_CC))])])
  
  (define_insn "*xorpsi3"
    [(set (match_operand:PSI 0 "register_operand"        "=??r,r  ,d  ,r")
@@ -5100,9 +5123,9 @@
  ;; "*ashlqi3"
  ;; "*ashlqq3"  "*ashluqq3"
  (define_insn_and_split "*ashl<mode>3_split"
-  [(set (match_operand:ALL1 0 "register_operand"              "=r,r,r,r,!d,r,r")
-        (ashift:ALL1 (match_operand:ALL1 1 "register_operand"  "0,0,0,0,0 ,0,0")
-                     (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm")))]
+  [(set (match_operand:ALL1 0 "register_operand"              "=r,r,r,r,r  ,!d,r,r")
+        (ashift:ALL1 (match_operand:ALL1 1 "register_operand"  "0,0,0,0,r  ,0 ,0,0")
+                     (match_operand:QI 2 "nop_general_operand" "r,L,P,K,C07,n ,n,Qm")))]
    ""
    "#"
    "&& reload_completed"
@@ -5112,15 +5135,15 @@
                (clobber (reg:CC REG_CC))])])
  
  (define_insn "*ashl<mode>3"
-  [(set (match_operand:ALL1 0 "register_operand"              "=r,r,r,r,!d,r,r")
-        (ashift:ALL1 (match_operand:ALL1 1 "register_operand"  "0,0,0,0,0 ,0,0")
-                     (match_operand:QI 2 "nop_general_operand" "r,L,P,K,n ,n,Qm")))
+  [(set (match_operand:ALL1 0 "register_operand"              "=r,r,r,r,r  ,!d,r,r")
+        (ashift:ALL1 (match_operand:ALL1 1 "register_operand"  "0,0,0,0,r  ,0 ,0,0")
+                     (match_operand:QI 2 "nop_general_operand" "r,L,P,K,C07,n ,n,Qm")))
     (clobber (reg:CC REG_CC))]
    "reload_completed"
    {
      return ashlqi3_out (insn, operands, NULL);
    }
-  [(set_attr "length" "5,0,1,2,4,6,9")
+  [(set_attr "length" "5,0,1,2,3,4,6,9")
     (set_attr "adjust_len" "ashlqi")])
  
  (define_insn_and_split "ashl<mode>3"
@@ -5459,9 +5482,9 @@
  ;; "ashrqi3"
  ;; "ashrqq3"  "ashruqq3"
  (define_insn_and_split "ashr<mode>3"
-  [(set (match_operand:ALL1 0 "register_operand"                  "=r,r,r,r,r          ,r      ,r")
-        (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand"    "0,0,0,0,0          ,0      ,0")
-                       (match_operand:QI 2 "nop_general_operand"   "r,L,P,K,C03 C04 C05,C06 C07,Qm")))]
+  [(set (match_operand:ALL1 0 "register_operand"                  "=r,r,r,r,r          ,r  ,r  ,r")
+        (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand"    "0,0,0,0,0          ,0  ,r  ,0")
+                       (match_operand:QI 2 "nop_general_operand"   "r,L,P,K,C03 C04 C05,C06,C07,Qm")))]
    ""
    "#"
    "&& reload_completed"
@@ -5471,15 +5494,15 @@
                (clobber (reg:CC REG_CC))])])
  
  (define_insn "*ashr<mode>3"
-  [(set (match_operand:ALL1 0 "register_operand"                  "=r,r,r,r,r          ,r      ,r")
-        (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand"    "0,0,0,0,0          ,0      ,0")
-                       (match_operand:QI 2 "nop_general_operand"   "r,L,P,K,C03 C04 C05,C06 C07,Qm")))
+  [(set (match_operand:ALL1 0 "register_operand"                  "=r,r,r,r,r          ,r  ,r  ,r")
+        (ashiftrt:ALL1 (match_operand:ALL1 1 "register_operand"    "0,0,0,0,0          ,0  ,r  ,0")
+                       (match_operand:QI 2 "nop_general_operand"   "r,L,P,K,C03 C04 C05,C06,C07,Qm")))
     (clobber (reg:CC REG_CC))]
    "reload_completed"
    {
      return ashrqi3_out (insn, operands, NULL);
    }
-  [(set_attr "length" "5,0,1,2,5,4,9")
+  [(set_attr "length" "5,0,1,2,5,4,3,9")
     (set_attr "adjust_len" "ashrqi")])
  
  ;; "ashrhi3"
@@ -6702,6 +6725,12 @@
                     (match_operand 1))
                (clobber (reg:CC REG_CC))])])
  
+(define_expand "gen_move_clobbercc_scratch"
+  [(parallel [(set (match_operand 0)
+                   (match_operand 1))
+              (clobber (match_operand 2))
+              (clobber (reg:CC REG_CC))])])
+
  ;; ----------------------------------------------------------------------
  ;; JUMP INSTRUCTIONS
  ;; ----------------------------------------------------------------------
diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt

index 625323f4e6a6ddb77dd1c4b8b2e37ae46f826d16..5f2e52ccfc7969d93e6a9e2df7a6b2fd74533cb4 100644 (file)
--- a/gcc/config/avr/avr.opt
+++ b/gcc/config/avr/avr.opt
@@ -131,6 +131,14 @@ mfract-convert-truncate
  Target Mask(FRACT_CONV_TRUNC)
  Allow to use truncation instead of rounding towards zero for fractional fixed-point types.
  
+mfuse-move
+Target Alias(mfuse-move=, 23, 0) Optimization
+Optimization. Run a post-reload pass that tweaks move instructions.
+
+mfuse-move=
+Target Joined RejectNegative UInteger Var(avr_fuse_move) Init(0) Optimization IntegerRange(0, 23)
+-mfuse-move=<0,23>     Optimization. Run a post-reload pass that tweaks move instructions.
+
  mabsdata
  Target Mask(ABSDATA)
  Assume that all data in static storage can be accessed by LDS / STS instructions.  This option is only useful for reduced Tiny devices like ATtiny40.
diff --git a/gcc/config/avr/constraints.md b/gcc/config/avr/constraints.md

index 9512302c252e2cd065dafb93bb40ceb541d92f86..ac64009b3c0374a61e2d780a3e7cba22f111ab4b 100644 (file)
--- a/gcc/config/avr/constraints.md
+++ b/gcc/config/avr/constraints.md
@@ -188,6 +188,11 @@
    (and (match_code "const_int")
         (match_test "avr_popcount_each_byte (op, 4, (1<<0) | (1<<1) | (1<<8))")))
  
+(define_constraint "Cb1"
+  "Constant 1-byte integer that has exactly 1 bit set."
+  (and (match_code "const_int")
+       (match_test "single_one_operand (op, QImode)")))
+
  (define_constraint "Cb2"
    "Constant 2-byte integer that has exactly 1 bit set."
    (and (match_code "const_int")
diff --git a/gcc/config/avr/predicates.md b/gcc/config/avr/predicates.md

index c44ebffc7628cd9309395693dbc9b780efd0cf32..d852f1c9e084a4e81a5f9ab824f458883af80902 100644 (file)
--- a/gcc/config/avr/predicates.md
+++ b/gcc/config/avr/predicates.md
@@ -178,6 +178,11 @@
    (ior (match_operand 0 "register_operand")
         (match_operand 0 "const0_operand")))
  
+;; Returns true if OP is either the constant zero or an upper register.
+(define_predicate "dreg_or_0_operand"
+  (ior (match_operand 0 "d_register_operand")
+       (match_operand 0 "const0_operand")))
+
  ;; Returns 1 if OP is a SYMBOL_REF.
  (define_predicate "symbol_ref_operand"
    (match_code "symbol_ref"))
@@ -374,3 +379,7 @@
    (ior (match_code "const_fixed")
         (match_code "const_double")
         (match_operand 0 "immediate_operand")))
+
+(define_predicate "set_some_operation"
+  (and (match_code "parallel")
+       (match_test "avr_set_some_operation (op)")))
diff --git a/gcc/config/avr/t-avr b/gcc/config/avr/t-avr

index 3da13289f332aa28abcb232e40f611918ea2d8c4..a01a4f4c217307b090f025af62d41a954da3d9cd 100644 (file)
--- a/gcc/config/avr/t-avr
+++ b/gcc/config/avr/t-avr
@@ -60,6 +60,7 @@ avr-log.o: $(srcdir)/config/avr/avr-log.cc \
         $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
  
  avr-passes.o: $(srcdir)/config/avr/avr-passes.cc \
+  $(srcdir)/config/avr/avr-passes-fuse-move.h \
    $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(INPUT_H)
         $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
  
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index c73025e2d0ec47e67931a3aa66f1ce0b64b0092c..cddbffea0e59e9696fe5b909700cbcbee485af08 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -898,7 +898,7 @@ Objective-C and Objective-C++ Dialects}.
  
  @emph{AVR Options}
  @gccoptlist{-mmcu=@var{mcu}  -mabsdata  -maccumulate-args
--mbranch-cost=@var{cost}  -mfuse-add=@var{level}
+-mbranch-cost=@var{cost}  -mfuse-add=@var{level}  -mfuse-move=@var{level}
  -mcall-prologues  -mgas-isr-prologues  -mint8  -mflmap
  -mdouble=@var{bits}  -mlong-double=@var{bits}
  -mn_flash=@var{size}  -mno-interrupts
@@ -24088,6 +24088,18 @@ The default uses @code{@var{level}=1} for optimizations @option{-Og}
  and @option{-O1}, and @code{@var{level}=2} for higher optimizations.
  Valid values for @var{level} are @code{0}, @code{1} and @code{2}.
  
+@opindex mfuse-move
+@item -mfuse-move
+@itemx -mno-fuse-move
+@itemx -mfuse-move=@var{level}
+Run a post-reload optimization pass that tries to fuse move instructions
+and to split multi-byte instructions into 8-bit operations.
+The default uses @code{@var{level}=3} for optimization @option{-O1},
+and @code{@var{level}=23} for higher optimizations.
+Valid values for @var{level} are in the range @code{0} @dots{} @code{23},
+which is a 3:2:2:2 mixed-radix value.  Each digit controls some
+aspect of the optimization.
+
  @opindex mdouble
  @opindex mlong-double
  @item -mdouble=@var{bits}
diff --git a/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-1.c b/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-1.c

new file mode 100644 (file)

index 0000000..82ce227
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-additional-options -std=gnu99 } */
+
+#define USE_VALUE 0
+
+#include "test-gprs.h"
+
+void test12 (void)
+{
+#ifndef __AVR_TINY__
+  rtest (32,  2, 0x02020103);
+  rtest (32,  2, 0xff00ff00);
+  rtest (32,  2, 0xfd02fb08);
+  rtest (32,  2, 0xfefbfdf7);
+  rtest (32, 16, 0xb1b2b3b4);
+  rtest (32,  2, 0xc1c2c3c4);
+  rtest (32,  2, 0x1c2c3c4c);
+  rtest (32,  2, 0x1ff);
+  rtest (32,  2, 0);
+  rtest (32,  2, 0x01020408);
+  rtest (32,  2, 0xffeeddbb);
+  rtest (32,  6, 0x11223344);
+  rtest (32,  6, 0x22334411);
+  rtest (32, 10, 0x11122233);
+  rtest (32, 14, 0x0a0b0c0d);
+  rtest (32,  2, 0xa0b0c0d0);
+  rtest (32,  2, 0xffffffff);
+  rtest (32,  2, 0xfdffffbf);
+  rtest (32,  2, 0x12345678);
+#endif
+}
+
+int main (void)
+{
+  test12 ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-2.c b/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-2.c

new file mode 100644 (file)

index 0000000..5c94cf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-2.c
@@ -0,0 +1,83 @@
+/* { dg-do run } */
+/* { dg-additional-options -std=gnu99 } */
+
+#define NI __attribute__((__noipa__))
+#define AI static inline __attribute__((__always_inline__))
+
+/*****************************************************************************/
+
+AI unsigned fn_crc_A (unsigned x, unsigned y)
+{
+  for (char i = 8; i--; x <<= 1)
+    y ^= (x ^ y) & 0x80 ? 79U : 0U;
+  return y;
+}
+
+NI unsigned fn_crc_N (unsigned x, unsigned y)
+{
+  for (char i = 8; i--; x <<= 1)
+    y ^= (x ^ y) & 0x80 ? 79U : 0U;
+  return y;
+}
+
+AI void test1_crc (unsigned x, unsigned y, int line)
+{
+  if (fn_crc_A (x, y) != fn_crc_N (x, y))
+    __builtin_exit (line);
+}
+
+__attribute__((__optimize__(3)))
+void test_crc (void)
+{
+  test1_crc (0x1ff, 0x1ff, __LINE__);
+  test1_crc (0x1ab, 0x1cd, __LINE__);
+  test1_crc (0xab, 0xcd, __LINE__);
+  test1_crc (0x87, 0x65, __LINE__);
+  test1_crc (0x3f, 0xb7, __LINE__);
+}
+
+/*****************************************************************************/
+
+AI long fn_build4_A (char a, char b, char c, char d) /* Always-inlined variant: pack A (byte 0), B, C, D (byte 3) into a long.  */
+{
+    long la = a;
+    long lb = (long) b << 8;
+    long lc = (long) c << 16;
+    long ld = (long) d << 24; /* Byte 3 is taken from D so that all four arguments are used.  */
+    long x = (la & 0xff) | (lb & 0xff00) | (lc & 0xff0000) | (ld & 0xff000000);
+    return x;
+}
+
+NI long fn_build4_N (char a, char b, char c, char d) /* noipa variant of fn_build4_A; must compute the same value.  */
+{
+    long la = a;
+    long lb = (long) b << 8;
+    long lc = (long) c << 16;
+    long ld = (long) d << 24; /* Keep identical to fn_build4_A: test1_build4 compares both results.  */
+    long x = (la & 0xff) | (lb & 0xff00) | (lc & 0xff0000) | (ld & 0xff000000);
+    return x;
+}
+
+AI void test1_build4 (char a, char b, char c, char d, int line)
+{
+  if (fn_build4_A (a, b, c, d) != fn_build4_N (a, b, c, d))
+    __builtin_exit (line);
+}
+
+void test_build4 (void)
+{
+  test1_build4 (1, 2, 3, 4, __LINE__);
+  test1_build4 (-2, -3, -4, -5, __LINE__);
+  test1_build4 (1, -2, 3, -4, __LINE__);
+  test1_build4 (-1, 2, -3, 4, __LINE__);
+}
+
+/*****************************************************************************/
+
+int main (void)
+{
+  test_crc ();
+  test_build4 ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/avr/torture/test-gprs.h b/gcc/testsuite/gcc.target/avr/torture/test-gprs.h

new file mode 100644 (file)

index 0000000..a982115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/test-gprs.h
@@ -0,0 +1,174 @@
+typedef __UINT8_TYPE__   u8;
+typedef __UINT16_TYPE__  u16;
+typedef __uint24         u24;
+typedef __UINT32_TYPE__  u32;
+typedef __UINT64_TYPE__  u64;
+
+__attribute__((__used__))
+u8 gprs[32];
+
+// USE_VALUE = 1: Copy constant to value[] and pass that to test_gprs().
+// USE_VALUE = 0: Pass Lval label from .macro run_test_gprs to test_gprs().
+#ifndef USE_VALUE
+#error define USE_VALUE to 0 or 1
+#endif
+
+#if USE_VALUE
+__attribute__((__used__))
+u8 value[8];
+#endif
+
+#ifdef __AVR_HAVE_JMP_CALL__
+#define XCALL "call"
+#else
+#define XCALL "rcall"
+#endif
+
+#define GPRS_16_29 "16,17,18,19,20,21,22,23,24,25,26,27,28,29"
+
+#ifdef __AVR_TINY__
+#define FIRST_GPR 16
+#define GPRs_29 GPRS_16_29
+#else
+#define FIRST_GPR 0
+#define GPRs_29 "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15," GPRS_16_29
+#endif
+
+#if __AVR_RODATA_IN_RAM__
+#define PGM __attribute__((progmem))
+#else
+#define PGM
+#endif
+
+// Save GPRs to gprs[].
+__attribute__((naked, used))
+void save_gprs (void)
+{
+    __asm (
+#ifdef __AVR_TINY__
+           "push r31"              "\n\t"
+           "push r30"              "\n\t"
+           "ldi r30, lo8(gprs+%0)" "\n\t"
+           "ldi r31, hi8(gprs+%0)" "\n\t"
+           ".irp n," GPRs_29       "\n\t"
+           "    st z+, \\n"        "\n\t"
+           ".endr"                 "\n\t"
+           "pop r16 $ st z+, r16"  "\n\t"
+           "pop r16 $ st z+, r16"  "\n\t"
+#else
+           "sts gprs+31, r31"      "\n\t"
+           "sts gprs+30, r30"      "\n\t"
+           "ldi r30, lo8(gprs+%0)" "\n\t"
+           "ldi r31, hi8(gprs+%0)" "\n\t"
+           ".irp n," GPRs_29       "\n\t"
+           "    st z+, \\n"        "\n\t"
+           ".endr"                 "\n\t"
+#endif
+           "ret"
+           :: "n" (FIRST_GPR));
+}
+
+// Restore GPRs from gprs[].
+__attribute__((naked, used))
+void restore_gprs (void)
+{
+    __asm ("ldi r30, lo8(gprs+%0)"   "\n\t"
+           "ldi r31, hi8(gprs+%0)"   "\n\t"
+           ".irp n," GPRs_29         "\n\t"
+           "    ld \\n, z+"          "\n\t"
+           ".endr"                   "\n\t"
+#ifdef __AVR_TINY__
+           "push r29"                "\n\t"
+           "ld   r29, z+"            "\n\t"
+           "push r29"                "\n\t"
+           "ld   r31, z"             "\n\t"
+           "pop  r30"                "\n\t"
+           "pop  r29"                "\n\t"
+#else
+           "ld  r30, z"              "\n\t"
+           "lds r31, gprs+31"        "\n\t"
+#endif
+           "ret"
+           :: "n" (FIRST_GPR));
+}
+
+// Write N-byte const value VAL:  *Z++ = VAL.
+__asm (".macro write_value n, val"                            "\n\t"
+       "    ldi r24, lo8(\\val)"                              "\n\t"
+       "    st  z+, r24"                                      "\n\t"
+       "    .if \\n > 1"                                      "\n\t"
+       "        write_value \"(\\n - 1)\", \"(\\val >> 8)\""  "\n\t"
+       "    .endif"                                           "\n\t"
+       ".endm");
+
+// 1) Save all gprs
+// 2) Call test_gprs (n, regno, Lval, line)
+// 3) Restore all gprs
+__asm (".macro run_test_gprs n, regno, val, Lval, line"     "\n\t"
+       "TestForLine\\line\\().L\\@:"                        "\n\t"
+       "    " XCALL " save_gprs"                            "\n\t"
+# if USE_VALUE
+       // Write VAL to value[]
+       "    ldi r30, lo8(\\Lval)"                           "\n\t"
+       "    ldi r31, hi8(\\Lval)"                           "\n\t"
+       "    write_value \\n, \\val"                         "\n\t"
+#endif
+       // Call test_gprs (u8 n, u8 regno, const void *Lval, int line)
+       "    ldi r24, \\n"                                   "\n\t"
+       "    ldi r22, \\regno"                               "\n\t"
+       "    ldi r21, hi8(\\Lval)"                           "\n\t"
+       "    ldi r20, lo8(\\Lval)"                           "\n\t"
+       "    ldi r19, hi8(\\line)"                           "\n\t"
+       "    ldi r18, lo8(\\line)"                           "\n\t"
+       "    " XCALL " test_gprs"                            "\n\t"
+       "    " XCALL " restore_gprs"                         "\n\t"
+       "DoneTestLine\\line\\().L\\@:"                       "\n\t"
+       ".endm");
+
+// Test if reg[REG] ... reg[REG + BITS/8 - 1] are holding VAL.
+#define rtest(BITS, REG, VAL)                                           \
+  do {                                                                  \
+    PGM static const u##BITS Lval = VAL;                                \
+    register u##BITS r##REG __asm (#REG) = VAL;                         \
+    __asm ("run_test_gprs %[size], %[reg], %[val], %[Lval], %[line] "   \
+           "; u" #BITS " r" #REG "=" #VAL ", line %[line];"             \
+           ::                                                           \
+           [size] "n" (BITS / 8), [reg] "n" (REG),                      \
+           [val] "n" ((u##BITS) VAL), [Lval] "i" (& Lval),              \
+           [line] "n" (__LINE__), "r" (r##REG));                        \
+  } while (0)
+
+#define rtest_nowhile0(BITS, REG, VAL)                                  \
+    PGM static const u##BITS Lval = VAL;                                \
+    register u##BITS r##REG __asm (#REG) = VAL;                         \
+    __asm ("run_test_gprs %[size], %[reg], %[val], %[Lval], %[line] "   \
+           "; u" #BITS " r" #REG "=" #VAL ", line %[line];"             \
+           ::                                                           \
+           [size] "n" (BITS / 8), [reg] "n" (REG),                      \
+           [val] "n" ((u##BITS) VAL), [Lval] "i" (& Lval),              \
+           [line] "n" (__LINE__), "r" (r##REG))
+
+
+#if ! __AVR_RODATA_IN_RAM__ || USE_VALUE
+#define LOAD_INCZ "ld %0,%a1+"
+#elif defined (__AVR_HAVE_LPMX__)
+#define LOAD_INCZ "lpm %0,%a1+"
+#else
+#define LOAD_INCZ "lpm $ mov %0,r0 $ adiw r30,1"
+#endif
+
+// Called by .macro run_test_gprs, which is invoked by rtest()
+// or rtest_nowhile0().
+__attribute__((__used__))
+void test_gprs (u8 n, u8 regno, const void *pval, int line)
+{
+  const u8 *r = gprs + regno;
+  const u8 *pv = pval;
+  for (u8 i = 0; i < n; ++i)
+    {
+      u8 vi;
+      __asm (LOAD_INCZ : "=r" (vi), "+z" (pv));
+      if (*r++ != vi)
+        __builtin_exit (line);
+    }
+}
author	Georg-Johann Lay <avr@gjlay.de>
	Sun, 17 Nov 2024 17:19:42 +0000 (18:19 +0100)
committer	Georg-Johann Lay <avr@gjlay.de>
	Mon, 18 Nov 2024 18:14:57 +0000 (19:14 +0100)
gcc/common/config/avr/avr-common.cc		patch \| blob \| blame \| history
gcc/config/avr/avr-log.cc		patch \| blob \| blame \| history
gcc/config/avr/avr-passes-fuse-move.h	[new file with mode: 0644]	patch \| blob
gcc/config/avr/avr-passes.cc		patch \| blob \| blame \| history
gcc/config/avr/avr-passes.def		patch \| blob \| blame \| history
gcc/config/avr/avr-protos.h		patch \| blob \| blame \| history
gcc/config/avr/avr.cc		patch \| blob \| blame \| history
gcc/config/avr/avr.md		patch \| blob \| blame \| history
gcc/config/avr/avr.opt		patch \| blob \| blame \| history
gcc/config/avr/constraints.md		patch \| blob \| blame \| history
gcc/config/avr/predicates.md		patch \| blob \| blame \| history
gcc/config/avr/t-avr		patch \| blob \| blame \| history
gcc/doc/invoke.texi		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/avr/torture/pr84211-fuse-move-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/avr/torture/test-gprs.h	[new file with mode: 0644]	patch \| blob