Indent labels

[thirdparty/binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 41a6a0817a832c76e57685b49876abc3b92a04dd..62b7cfbe6c7556f1a2d66d546893309a94726adf 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1,5 +1,5 @@
  /* tc-i386.c -- Assemble code for the Intel 80386
-   Copyright (C) 1989-2019 Free Software Foundation, Inc.
+   Copyright (C) 1989-2020 Free Software Foundation, Inc.
  
     This file is part of GAS, the GNU Assembler.
  
@@ -44,10 +44,6 @@
  #endif
  #endif
  
-#ifndef REGISTER_WARNINGS
-#define REGISTER_WARNINGS 1
-#endif
-
  #ifndef INFER_ADDR_PREFIX
  #define INFER_ADDR_PREFIX 1
  #endif
@@ -182,6 +178,7 @@ static char *parse_insn (char *, char *);
  static char *parse_operands (char *, const char *);
  static void swap_operands (void);
  static void swap_2_operands (int, int);
+static enum flag_code i386_addressing_mode (void);
  static void optimize_imm (void);
  static void optimize_disp (void);
  static const insn_template *match_template (char);
@@ -353,6 +350,9 @@ struct _i386_insn
      unsigned int prefixes;
      unsigned char prefix[MAX_PREFIXES];
  
+    /* Register is in low 3 bits of opcode.  */
+    bfd_boolean short_form;
+
      /* The operand to a branch insn indicates an absolute branch.  */
      bfd_boolean jumpabsolute;
  
@@ -418,7 +418,7 @@ struct _i386_insn
      enum
        {
         vex_encoding_default = 0,
-       vex_encoding_vex2,
+       vex_encoding_vex,
         vex_encoding_vex3,
         vex_encoding_evex
        } vec_encoding;
@@ -597,9 +597,11 @@ static int shared = 0;
     0 if att syntax.  */
  static int intel_syntax = 0;
  
-/* 1 for Intel64 ISA,
-   0 if AMD64 ISA.  */
-static int intel64;
+static enum x86_64_isa
+{
+  amd64 = 1,   /* AMD64 ISA.  */
+  intel64      /* Intel64 ISA.  */
+} isa64;
  
  /* 1 for intel mnemonic,
     0 if att mnemonic.  */
@@ -981,6 +983,8 @@ static const arch_entry cpu_arch[] =
      CPU_SSE2_FLAGS, 0 },
    { STRING_COMMA_LEN (".sse3"), PROCESSOR_UNKNOWN,
      CPU_SSE3_FLAGS, 0 },
+  { STRING_COMMA_LEN (".sse4a"), PROCESSOR_UNKNOWN,
+    CPU_SSE4A_FLAGS, 0 },
    { STRING_COMMA_LEN (".ssse3"), PROCESSOR_UNKNOWN,
      CPU_SSSE3_FLAGS, 0 },
    { STRING_COMMA_LEN (".sse4.1"), PROCESSOR_UNKNOWN,
@@ -1051,6 +1055,8 @@ static const arch_entry cpu_arch[] =
      CPU_EPT_FLAGS, 0 },
    { STRING_COMMA_LEN (".lzcnt"), PROCESSOR_UNKNOWN,
      CPU_LZCNT_FLAGS, 0 },
+  { STRING_COMMA_LEN (".popcnt"), PROCESSOR_UNKNOWN,
+    CPU_POPCNT_FLAGS, 0 },
    { STRING_COMMA_LEN (".hle"), PROCESSOR_UNKNOWN,
      CPU_HLE_FLAGS, 0 },
    { STRING_COMMA_LEN (".rtm"), PROCESSOR_UNKNOWN,
@@ -1175,6 +1181,7 @@ static const noarch_entry cpu_noarch[] =
    { STRING_COMMA_LEN ("nosse"),  CPU_ANY_SSE_FLAGS },
    { STRING_COMMA_LEN ("nosse2"),  CPU_ANY_SSE2_FLAGS },
    { STRING_COMMA_LEN ("nosse3"),  CPU_ANY_SSE3_FLAGS },
+  { STRING_COMMA_LEN ("nosse4a"),  CPU_ANY_SSE4A_FLAGS },
    { STRING_COMMA_LEN ("nossse3"),  CPU_ANY_SSSE3_FLAGS },
    { STRING_COMMA_LEN ("nosse4.1"),  CPU_ANY_SSE4_1_FLAGS },
    { STRING_COMMA_LEN ("nosse4.2"),  CPU_ANY_SSE4_2_FLAGS },
@@ -1838,6 +1845,8 @@ cpu_flags_and_not (i386_cpu_flags x, i386_cpu_flags y)
    return x;
  }
  
+static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
+
  #define CPU_FLAGS_ARCH_MATCH           0x1
  #define CPU_FLAGS_64BIT_MATCH          0x2
  
@@ -2180,7 +2189,7 @@ operand_size_match (const insn_template *t)
  
    if (!t->opcode_modifier.d)
      {
-mismatch:
+    mismatch:
        if (!match)
         i.error = operand_size_mismatch;
        return match;
@@ -2235,15 +2244,14 @@ operand_type_match (i386_operand_type overlap,
    if (given.bitfield.baseindex == overlap.bitfield.baseindex)
      return 1;
  
-mismatch:
+ mismatch:
    i.error = operand_type_mismatch;
    return 0;
  }
  
  /* If given types g0 and g1 are registers they must be of the same type
     unless the expected operand type register overlap is null.
-   Memory operand size of certain SIMD instructions is also being checked
-   here.  */
+   Some Intel syntax memory operand size checking also happens here.  */
  
  static INLINE int
  operand_type_register_match (i386_operand_type g0,
@@ -2255,14 +2263,16 @@ operand_type_register_match (i386_operand_type g0,
        && g0.bitfield.class != RegSIMD
        && (!operand_type_check (g0, anymem)
           || g0.bitfield.unspecified
-         || t0.bitfield.class != RegSIMD))
+         || (t0.bitfield.class != Reg
+             && t0.bitfield.class != RegSIMD)))
      return 1;
  
    if (g1.bitfield.class != Reg
        && g1.bitfield.class != RegSIMD
        && (!operand_type_check (g1, anymem)
           || g1.bitfield.unspecified
-         || t1.bitfield.class != RegSIMD))
+         || (t1.bitfield.class != Reg
+             && t1.bitfield.class != RegSIMD)))
      return 1;
  
    if (g0.bitfield.byte == g1.bitfield.byte
@@ -3393,7 +3403,6 @@ tc_i386_fix_adjustable (fixS *fixP ATTRIBUTE_UNUSED)
    if (fixP->fx_r_type == BFD_RELOC_SIZE32
        || fixP->fx_r_type == BFD_RELOC_SIZE64
        || fixP->fx_r_type == BFD_RELOC_386_GOTOFF
-      || fixP->fx_r_type == BFD_RELOC_386_PLT32
        || fixP->fx_r_type == BFD_RELOC_386_GOT32
        || fixP->fx_r_type == BFD_RELOC_386_GOT32X
        || fixP->fx_r_type == BFD_RELOC_386_TLS_GD
@@ -3406,7 +3415,6 @@ tc_i386_fix_adjustable (fixS *fixP ATTRIBUTE_UNUSED)
        || fixP->fx_r_type == BFD_RELOC_386_TLS_LE
        || fixP->fx_r_type == BFD_RELOC_386_TLS_GOTDESC
        || fixP->fx_r_type == BFD_RELOC_386_TLS_DESC_CALL
-      || fixP->fx_r_type == BFD_RELOC_X86_64_PLT32
        || fixP->fx_r_type == BFD_RELOC_X86_64_GOT32
        || fixP->fx_r_type == BFD_RELOC_X86_64_GOTPCREL
        || fixP->fx_r_type == BFD_RELOC_X86_64_GOTPCRELX
@@ -3989,13 +3997,13 @@ optimize_encoding (void)
    unsigned int j;
  
    if (optimize_for_space
+      && !is_any_vex_encoding (&i.tm)
        && i.reg_operands == 1
        && i.imm_operands == 1
        && !i.types[1].bitfield.byte
        && i.op[0].imms->X_op == O_constant
        && fits_in_imm7 (i.op[0].imms->X_add_number)
-      && ((i.tm.base_opcode == 0xa8
-          && i.tm.extension_opcode == None)
+      && (i.tm.base_opcode == 0xa8
           || (i.tm.base_opcode == 0xf6
               && i.tm.extension_opcode == 0x0)))
      {
@@ -4008,21 +4016,20 @@ optimize_encoding (void)
           i.types[1].bitfield.byte = 1;
           /* Ignore the suffix.  */
           i.suffix = 0;
-         if (base_regnum >= 4
-             && !(i.op[1].regs->reg_flags & RegRex))
-           {
-             /* Handle SP, BP, SI and DI registers.  */
-             if (i.types[1].bitfield.word)
-               j = 16;
-             else if (i.types[1].bitfield.dword)
-               j = 32;
-             else
-               j = 48;
-             i.op[1].regs -= j;
-           }
+         /* Convert to byte registers.  */
+         if (i.types[1].bitfield.word)
+           j = 16;
+         else if (i.types[1].bitfield.dword)
+           j = 32;
+         else
+           j = 48;
+         if (!(i.op[1].regs->reg_flags & RegRex) && base_regnum < 4)
+           j += 8;
+         i.op[1].regs -= j;
         }
      }
    else if (flag_code == CODE_64BIT
+          && !is_any_vex_encoding (&i.tm)
            && ((i.types[1].bitfield.qword
                 && i.reg_operands == 1
                 && i.imm_operands == 1
@@ -4031,9 +4038,8 @@ optimize_encoding (void)
                      && i.tm.extension_opcode == None
                      && fits_in_unsigned_long (i.op[0].imms->X_add_number))
                     || (fits_in_imm31 (i.op[0].imms->X_add_number)
-                       && (((i.tm.base_opcode == 0x24
-                             || i.tm.base_opcode == 0xa8)
-                            && i.tm.extension_opcode == None)
+                       && ((i.tm.base_opcode == 0x24
+                            || i.tm.base_opcode == 0xa8)
                             || (i.tm.base_opcode == 0x80
                                 && i.tm.extension_opcode == 0x4)
                             || ((i.tm.base_opcode == 0xf6
@@ -4045,13 +4051,11 @@ optimize_encoding (void)
                || (i.types[0].bitfield.qword
                    && ((i.reg_operands == 2
                         && i.op[0].regs == i.op[1].regs
-                       && ((i.tm.base_opcode == 0x30
-                            || i.tm.base_opcode == 0x28)
-                           && i.tm.extension_opcode == None))
+                       && (i.tm.base_opcode == 0x30
+                           || i.tm.base_opcode == 0x28))
                        || (i.reg_operands == 1
                            && i.operands == 1
-                          && i.tm.base_opcode == 0x30
-                          && i.tm.extension_opcode == None)))))
+                          && i.tm.base_opcode == 0x30)))))
      {
        /* Optimize: -O:
            andq $imm31, %r64   -> andl $imm31, %r32
@@ -4085,13 +4089,13 @@ optimize_encoding (void)
               i.tm.base_opcode = 0xb8;
               i.tm.extension_opcode = None;
               i.tm.opcode_modifier.w = 0;
-             i.tm.opcode_modifier.shortform = 1;
               i.tm.opcode_modifier.modrm = 0;
             }
         }
      }
    else if (optimize > 1
            && !optimize_for_space
+          && !is_any_vex_encoding (&i.tm)
            && i.reg_operands == 2
            && i.op[0].regs == i.op[1].regs
            && ((i.tm.base_opcode & ~(Opcode_D | 1)) == 0x8
@@ -4322,14 +4326,16 @@ md_assemble (char *line)
    /* Now we've parsed the mnemonic into a set of templates, and have the
       operands at hand.  */
  
-  /* All intel opcodes have reversed operands except for "bound" and
-     "enter".  We also don't reverse intersegment "jmp" and "call"
-     instructions with 2 immediate operands so that the immediate segment
-     precedes the offset, as it does when in AT&T mode. */
+  /* All Intel opcodes have reversed operands except for "bound", "enter",
+     "monitor*", and "mwait*".  We also don't reverse intersegment "jmp"
+     and "call" instructions with 2 immediate operands so that the immediate
+     segment precedes the offset, as it does when in AT&T mode. */
    if (intel_syntax
        && i.operands > 1
        && (strcmp (mnemonic, "bound") != 0)
        && (strcmp (mnemonic, "invlpga") != 0)
+      && (strncmp (mnemonic, "monitor", 7) != 0)
+      && (strncmp (mnemonic, "mwait", 5) != 0)
        && !(operand_type_check (i.types[0], imm)
            && operand_type_check (i.types[1], imm)))
      swap_operands ();
@@ -4380,22 +4386,6 @@ md_assemble (char *line)
         : as_bad) (_("SSE instruction `%s' is used"), i.tm.name);
      }
  
-  /* Zap movzx and movsx suffix.  The suffix has been set from
-     "word ptr" or "byte ptr" on the source operand in Intel syntax
-     or extracted from mnemonic in AT&T syntax.  But we'll use
-     the destination register to choose the suffix for encoding.  */
-  if ((i.tm.base_opcode & ~9) == 0x0fb6)
-    {
-      /* In Intel syntax, there must be a suffix.  In AT&T syntax, if
-        there is no suffix, the default will be byte extension.  */
-      if (i.reg_operands != 2
-         && !i.suffix
-         && intel_syntax)
-       as_bad (_("ambiguous operand size for `%s'"), i.tm.name);
-
-      i.suffix = 0;
-    }
-
    if (i.tm.opcode_modifier.fwait)
      if (!add_prefix (FWAIT_OPCODE))
        return;
@@ -4575,6 +4565,7 @@ md_assemble (char *line)
           if (i.types[x].bitfield.class == Reg && i.types[x].bitfield.byte
               && (i.op[x].regs->reg_flags & RegRex64) == 0)
             {
+             gas_assert (!(i.op[x].regs->reg_flags & RegRex));
               /* In case it is "hi" register, give up.  */
               if (i.op[x].regs->reg_num > 3)
                 as_bad (_("can't encode register '%s%s' in an "
@@ -4593,7 +4584,7 @@ md_assemble (char *line)
    if (i.rex == 0 && i.rex_encoding)
      {
        /* Check if we can add a REX_OPCODE byte.  Look for 8 bit operand
-         that uses legacy register.  If it is "hi" register, don't add
+        that uses legacy register.  If it is "hi" register, don't add
          the REX_OPCODE byte.  */
        int x;
        for (x = 0; x < 2; x++)
@@ -4602,6 +4593,7 @@ md_assemble (char *line)
             && (i.op[x].regs->reg_flags & RegRex64) == 0
             && i.op[x].regs->reg_num > 3)
           {
+           gas_assert (!(i.op[x].regs->reg_flags & RegRex));
             i.rex_encoding = FALSE;
             break;
           }
@@ -4722,8 +4714,8 @@ parse_insn (char *line, char *mnemonic)
                   i.dir_encoding = dir_encoding_store;
                   break;
                 case 0x4:
-                 /* {vex2} */
-                 i.vec_encoding = vex_encoding_vex2;
+                 /* {vex} */
+                 i.vec_encoding = vex_encoding_vex;
                   break;
                 case 0x5:
                   /* {vex3} */
@@ -4800,7 +4792,7 @@ parse_insn (char *line, char *mnemonic)
  
    if (!current_templates)
      {
-check_suffix:
+    check_suffix:
        if (mnem_p > mnemonic)
         {
           /* See if we can get a match by trimming off a suffix.  */
@@ -5368,7 +5360,6 @@ check_VecOperands (const insn_template *t)
  {
    unsigned int op;
    i386_cpu_flags cpu;
-  static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
  
    /* Templates allowing for ZMMword as well as YMMword and/or XMMword for
       any one operand are implicity requiring AVX512VL support if the actual
@@ -5745,9 +5736,7 @@ match_template (char mnem_suffix)
    i386_opcode_modifier suffix_check;
    i386_operand_type operand_types [MAX_OPERANDS];
    int addr_prefix_disp;
-  unsigned int j;
-  unsigned int found_cpu_match, size_match;
-  unsigned int check_register;
+  unsigned int j, size_match, check_register;
    enum i386_error specific_error = 0;
  
  #if MAX_OPERANDS != 5
@@ -5799,9 +5788,7 @@ match_template (char mnem_suffix)
  
        /* Check processor support.  */
        i.error = unsupported;
-      found_cpu_match = (cpu_flags_match (t)
-                        == CPU_FLAGS_PERFECT_MATCH);
-      if (!found_cpu_match)
+      if (cpu_flags_match (t) != CPU_FLAGS_PERFECT_MATCH)
         continue;
  
        /* Check AT&T mnemonic.   */
@@ -5809,14 +5796,32 @@ match_template (char mnem_suffix)
        if (intel_mnemonic && t->opcode_modifier.attmnemonic)
         continue;
  
-      /* Check AT&T/Intel syntax and Intel64/AMD64 ISA.   */
+      /* Check AT&T/Intel syntax.  */
        i.error = unsupported_syntax;
        if ((intel_syntax && t->opcode_modifier.attsyntax)
-         || (!intel_syntax && t->opcode_modifier.intelsyntax)
-         || (intel64 && t->opcode_modifier.amd64)
-         || (!intel64 && t->opcode_modifier.intel64))
+         || (!intel_syntax && t->opcode_modifier.intelsyntax))
         continue;
  
+      /* Check Intel64/AMD64 ISA.   */
+      switch (isa64)
+       {
+       default:
+         /* Default: Don't accept Intel64.  */
+         if (t->opcode_modifier.isa64 == INTEL64)
+           continue;
+         break;
+       case amd64:
+         /* -mamd64: Don't accept Intel64 or Intel64-only templates.  */
+         if (t->opcode_modifier.isa64 >= INTEL64)
+           continue;
+         break;
+       case intel64:
+         /* -mintel64: Don't accept AMD64.  */
+         if (t->opcode_modifier.isa64 == AMD64 && flag_code == CODE_64BIT)
+           continue;
+         break;
+       }
+
        /* Check the suffix.  */
        i.error = invalid_instruction_suffix;
        if ((t->opcode_modifier.no_bsuf && suffix_check.no_bsuf)
@@ -5884,51 +5889,50 @@ match_template (char mnem_suffix)
             break;
         }
  
-      /* Address size prefix will turn Disp64/Disp32/Disp16 operand
-        into Disp32/Disp16/Disp32 operand.  */
-      if (i.prefix[ADDR_PREFIX] != 0)
-         {
-           /* There should be only one Disp operand.  */
-           switch (flag_code)
-           {
-           case CODE_16BIT:
-             for (j = 0; j < MAX_OPERANDS; j++)
-               {
-                 if (operand_types[j].bitfield.disp16)
-                   {
-                     addr_prefix_disp = j;
-                     operand_types[j].bitfield.disp32 = 1;
-                     operand_types[j].bitfield.disp16 = 0;
-                     break;
-                   }
-               }
+      if (!t->opcode_modifier.jump
+         || t->opcode_modifier.jump == JUMP_ABSOLUTE)
+       {
+         /* There should be only one Disp operand.  */
+         for (j = 0; j < MAX_OPERANDS; j++)
+           if (operand_type_check (operand_types[j], disp))
               break;
-           case CODE_32BIT:
-             for (j = 0; j < MAX_OPERANDS; j++)
+         if (j < MAX_OPERANDS)
+           {
+             bfd_boolean override = (i.prefix[ADDR_PREFIX] != 0);
+
+             addr_prefix_disp = j;
+
+             /* Address size prefix will turn Disp64/Disp32S/Disp32/Disp16
+                operand into Disp32/Disp32/Disp16/Disp32 operand.  */
+             switch (flag_code)
                 {
-                 if (operand_types[j].bitfield.disp32)
+               case CODE_16BIT:
+                 override = !override;
+                 /* Fall through.  */
+               case CODE_32BIT:
+                 if (operand_types[j].bitfield.disp32
+                     && operand_types[j].bitfield.disp16)
                     {
-                     addr_prefix_disp = j;
-                     operand_types[j].bitfield.disp32 = 0;
-                     operand_types[j].bitfield.disp16 = 1;
-                     break;
+                     operand_types[j].bitfield.disp16 = override;
+                     operand_types[j].bitfield.disp32 = !override;
                     }
-               }
-             break;
-           case CODE_64BIT:
-             for (j = 0; j < MAX_OPERANDS; j++)
-               {
-                 if (operand_types[j].bitfield.disp64)
+                 operand_types[j].bitfield.disp32s = 0;
+                 operand_types[j].bitfield.disp64 = 0;
+                 break;
+
+               case CODE_64BIT:
+                 if (operand_types[j].bitfield.disp32s
+                     || operand_types[j].bitfield.disp64)
                     {
-                     addr_prefix_disp = j;
-                     operand_types[j].bitfield.disp64 = 0;
-                     operand_types[j].bitfield.disp32 = 1;
-                     break;
+                     operand_types[j].bitfield.disp64 &= !override;
+                     operand_types[j].bitfield.disp32s &= !override;
+                     operand_types[j].bitfield.disp32 = override;
                     }
+                 operand_types[j].bitfield.disp16 = 0;
+                 break;
                 }
-             break;
             }
-         }
+       }
  
        /* Force 0x8b encoding for "mov foo@GOT, %eax".  */
        if (i.reloc[0] == BFD_RELOC_386_GOT32 && t->base_opcode == 0xa0)
@@ -6026,7 +6030,7 @@ match_template (char mnem_suffix)
               if (!t->opcode_modifier.d)
                 continue;
  
-check_reverse:
+           check_reverse:
               if (!(size_match & MATCH_REVERSE))
                 continue;
               /* Try reversing direction of operands.  */
@@ -6125,8 +6129,6 @@ check_reverse:
           /* Found either forward/reverse 2, 3 or 4 operand match here:
              slip through to break.  */
         }
-      if (!found_cpu_match)
-       continue;
  
        /* Check if vector and VEX operands are valid.  */
        if (check_VecOperands (t) || VEX_check_operands (t))
@@ -6298,8 +6300,18 @@ process_suffix (void)
    else if (i.tm.opcode_modifier.size == SIZE64)
      i.suffix = QWORD_MNEM_SUFFIX;
    else if (i.reg_operands
-          && (i.operands > 1 || i.types[0].bitfield.class == Reg))
+          && (i.operands > 1 || i.types[0].bitfield.class == Reg)
+          && !i.tm.opcode_modifier.addrprefixopreg)
      {
+      unsigned int numop = i.operands;
+
+      /* movsx/movzx want only their source operand considered here, for the
+        ambiguity checking below.  The suffix will be replaced afterwards
+        to represent the destination (register).  */
+      if (((i.tm.base_opcode | 8) == 0xfbe && i.tm.opcode_modifier.w)
+         || (i.tm.base_opcode == 0x63 && i.tm.cpu_flags.bitfield.cpu64))
+       --i.operands;
+
        /* If there's no instruction mnemonic suffix we try to invent one
          based on GPR operands.  */
        if (!i.suffix)
@@ -6308,50 +6320,32 @@ process_suffix (void)
              Destination register type is more significant than source
              register type.  crc32 in SSE4.2 prefers source register
              type. */
-         if (i.tm.base_opcode == 0xf20f38f0
-             && i.types[0].bitfield.class == Reg)
-           {
-             if (i.types[0].bitfield.byte)
-               i.suffix = BYTE_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.word)
-               i.suffix = WORD_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.dword)
-               i.suffix = LONG_MNEM_SUFFIX;
-             else if (i.types[0].bitfield.qword)
-               i.suffix = QWORD_MNEM_SUFFIX;
-           }
+         unsigned int op = i.tm.base_opcode != 0xf20f38f0 ? i.operands : 1;
  
-         if (!i.suffix)
-           {
-             int op;
-
-             if (i.tm.base_opcode == 0xf20f38f0)
-               {
-                 /* We have to know the operand size for crc32.  */
-                 as_bad (_("ambiguous memory operand size for `%s`"),
-                         i.tm.name);
-                 return 0;
-               }
+         while (op--)
+           if (i.tm.operand_types[op].bitfield.instance == InstanceNone
+               || i.tm.operand_types[op].bitfield.instance == Accum)
+             {
+               if (i.types[op].bitfield.class != Reg)
+                 continue;
+               if (i.types[op].bitfield.byte)
+                 i.suffix = BYTE_MNEM_SUFFIX;
+               else if (i.types[op].bitfield.word)
+                 i.suffix = WORD_MNEM_SUFFIX;
+               else if (i.types[op].bitfield.dword)
+                 i.suffix = LONG_MNEM_SUFFIX;
+               else if (i.types[op].bitfield.qword)
+                 i.suffix = QWORD_MNEM_SUFFIX;
+               else
+                 continue;
+               break;
+             }
  
-             for (op = i.operands; --op >= 0;)
-               if (i.tm.operand_types[op].bitfield.instance == InstanceNone
-                   || i.tm.operand_types[op].bitfield.instance == Accum)
-                 {
-                   if (i.types[op].bitfield.class != Reg)
-                     continue;
-                   if (i.types[op].bitfield.byte)
-                     i.suffix = BYTE_MNEM_SUFFIX;
-                   else if (i.types[op].bitfield.word)
-                     i.suffix = WORD_MNEM_SUFFIX;
-                   else if (i.types[op].bitfield.dword)
-                     i.suffix = LONG_MNEM_SUFFIX;
-                   else if (i.types[op].bitfield.qword)
-                     i.suffix = QWORD_MNEM_SUFFIX;
-                   else
-                     continue;
-                   break;
-                 }
-           }
+         /* As an exception, movsx/movzx silently default to a byte source
+            in AT&T mode.  */
+         if ((i.tm.base_opcode | 8) == 0xfbe && i.tm.opcode_modifier.w
+             && !i.suffix && !intel_syntax)
+           i.suffix = BYTE_MNEM_SUFFIX;
         }
        else if (i.suffix == BYTE_MNEM_SUFFIX)
         {
@@ -6398,13 +6392,11 @@ process_suffix (void)
         ;
        else
         abort ();
+
+      /* Undo the movsx/movzx change done above.  */
+      i.operands = numop;
      }
-  else if (i.tm.opcode_modifier.defaultsize
-          && !i.suffix
-          /* exclude fldenv/frstor/fsave/fstenv */
-          && i.tm.opcode_modifier.no_ssuf
-          /* exclude sysret */
-          && i.tm.base_opcode != 0x0f07)
+  else if (i.tm.opcode_modifier.defaultsize && !i.suffix)
      {
        i.suffix = stackop_size;
        if (stackop_size == LONG_MNEM_SUFFIX)
@@ -6425,8 +6417,7 @@ process_suffix (void)
                      i.tm.name);
         }
      }
-  else if (intel_syntax
-          && !i.suffix
+  else if (!i.suffix
            && (i.tm.opcode_modifier.jump == JUMP_ABSOLUTE
                || i.tm.opcode_modifier.jump == JUMP_BYTE
                || i.tm.opcode_modifier.jump == JUMP_INTERSEGMENT
@@ -6453,45 +6444,152 @@ process_suffix (void)
         }
      }
  
-  if (!i.suffix)
-    {
-      if (!intel_syntax)
+  if (!i.suffix
+      && (!i.tm.opcode_modifier.defaultsize
+         /* Also cover lret/retf/iret in 64-bit mode.  */
+         || (flag_code == CODE_64BIT
+             && !i.tm.opcode_modifier.no_lsuf
+             && !i.tm.opcode_modifier.no_qsuf))
+      && !i.tm.opcode_modifier.ignoresize
+      /* Accept FLDENV et al without suffix.  */
+      && (i.tm.opcode_modifier.no_ssuf || i.tm.opcode_modifier.floatmf))
+    {
+      unsigned int suffixes, evex = 0;
+
+      suffixes = !i.tm.opcode_modifier.no_bsuf;
+      if (!i.tm.opcode_modifier.no_wsuf)
+       suffixes |= 1 << 1;
+      if (!i.tm.opcode_modifier.no_lsuf)
+       suffixes |= 1 << 2;
+      if (!i.tm.opcode_modifier.no_ldsuf)
+       suffixes |= 1 << 3;
+      if (!i.tm.opcode_modifier.no_ssuf)
+       suffixes |= 1 << 4;
+      if (flag_code == CODE_64BIT && !i.tm.opcode_modifier.no_qsuf)
+       suffixes |= 1 << 5;
+
+      /* For [XYZ]MMWORD operands inspect operand sizes.  While generally
+        also suitable for AT&T syntax mode, it was requested that this be
+        restricted to just Intel syntax.  */
+      if (intel_syntax && is_any_vex_encoding (&i.tm) && !i.broadcast)
         {
-         if (i.tm.opcode_modifier.w)
+         unsigned int op;
+
+         for (op = 0; op < i.tm.operands; ++op)
             {
-             as_bad (_("no instruction mnemonic suffix given and "
-                       "no register operands; can't size instruction"));
-             return 0;
+             if (is_evex_encoding (&i.tm)
+                 && !cpu_arch_flags.bitfield.cpuavx512vl)
+               {
+                 if (i.tm.operand_types[op].bitfield.ymmword)
+                   i.tm.operand_types[op].bitfield.xmmword = 0;
+                 if (i.tm.operand_types[op].bitfield.zmmword)
+                   i.tm.operand_types[op].bitfield.ymmword = 0;
+                 if (!i.tm.opcode_modifier.evex
+                     || i.tm.opcode_modifier.evex == EVEXDYN)
+                   i.tm.opcode_modifier.evex = EVEX512;
+               }
+
+             if (i.tm.operand_types[op].bitfield.xmmword
+                 + i.tm.operand_types[op].bitfield.ymmword
+                 + i.tm.operand_types[op].bitfield.zmmword < 2)
+               continue;
+
+             /* Any properly sized operand disambiguates the insn.  */
+             if (i.types[op].bitfield.xmmword
+                 || i.types[op].bitfield.ymmword
+                 || i.types[op].bitfield.zmmword)
+               {
+                 suffixes &= ~(7 << 6);
+                 evex = 0;
+                 break;
+               }
+
+             if ((i.flags[op] & Operand_Mem)
+                 && i.tm.operand_types[op].bitfield.unspecified)
+               {
+                 if (i.tm.operand_types[op].bitfield.xmmword)
+                   suffixes |= 1 << 6;
+                 if (i.tm.operand_types[op].bitfield.ymmword)
+                   suffixes |= 1 << 7;
+                 if (i.tm.operand_types[op].bitfield.zmmword)
+                   suffixes |= 1 << 8;
+                 if (is_evex_encoding (&i.tm))
+                   evex = EVEX512;
+               }
             }
         }
-      else
-       {
-         unsigned int suffixes;
  
-         suffixes = !i.tm.opcode_modifier.no_bsuf;
-         if (!i.tm.opcode_modifier.no_wsuf)
-           suffixes |= 1 << 1;
-         if (!i.tm.opcode_modifier.no_lsuf)
-           suffixes |= 1 << 2;
-         if (!i.tm.opcode_modifier.no_ldsuf)
-           suffixes |= 1 << 3;
-         if (!i.tm.opcode_modifier.no_ssuf)
-           suffixes |= 1 << 4;
-         if (flag_code == CODE_64BIT && !i.tm.opcode_modifier.no_qsuf)
-           suffixes |= 1 << 5;
-
-         /* There are more than suffix matches.  */
-         if (i.tm.opcode_modifier.w
-             || ((suffixes & (suffixes - 1))
-                 && !i.tm.opcode_modifier.defaultsize
-                 && !i.tm.opcode_modifier.ignoresize))
+      /* Are multiple suffixes / operand sizes allowed?  */
+      if (suffixes & (suffixes - 1))
+       {
+         if (intel_syntax
+             && (!i.tm.opcode_modifier.defaultsize
+                 || operand_check == check_error))
             {
               as_bad (_("ambiguous operand size for `%s'"), i.tm.name);
               return 0;
             }
+         if (operand_check == check_error)
+           {
+             as_bad (_("no instruction mnemonic suffix given and "
+                       "no register operands; can't size `%s'"), i.tm.name);
+             return 0;
+           }
+         if (operand_check == check_warning)
+           as_warn (_("%s; using default for `%s'"),
+                      intel_syntax
+                      ? _("ambiguous operand size")
+                      : _("no instruction mnemonic suffix given and "
+                          "no register operands"),
+                      i.tm.name);
+
+         if (i.tm.opcode_modifier.floatmf)
+           i.suffix = SHORT_MNEM_SUFFIX;
+         else if ((i.tm.base_opcode | 8) == 0xfbe
+                  || (i.tm.base_opcode == 0x63
+                      && i.tm.cpu_flags.bitfield.cpu64))
+           /* handled below */;
+         else if (evex)
+           i.tm.opcode_modifier.evex = evex;
+         else if (flag_code == CODE_16BIT)
+           i.suffix = WORD_MNEM_SUFFIX;
+         else if (!i.tm.opcode_modifier.no_lsuf)
+           i.suffix = LONG_MNEM_SUFFIX;
+         else
+           i.suffix = QWORD_MNEM_SUFFIX;
         }
      }
  
+  if ((i.tm.base_opcode | 8) == 0xfbe
+      || (i.tm.base_opcode == 0x63 && i.tm.cpu_flags.bitfield.cpu64))
+    {
+      /* In Intel syntax, movsx/movzx must have a "suffix" (checked above).
+        In AT&T syntax, if there is no suffix (warned about above), the default
+        will be byte extension.  */
+      if (i.tm.opcode_modifier.w && i.suffix && i.suffix != BYTE_MNEM_SUFFIX)
+       i.tm.base_opcode |= 1;
+
+      /* For further processing, the suffix should represent the destination
+        (register).  This is already the case when one was used with
+        mov[sz][bw]*, but we need to replace it for mov[sz]x, or if there was
+        no suffix to begin with.  */
+      if (i.tm.opcode_modifier.w || i.tm.base_opcode == 0x63 || !i.suffix)
+       {
+         if (i.types[1].bitfield.word)
+           i.suffix = WORD_MNEM_SUFFIX;
+         else if (i.types[1].bitfield.qword)
+           i.suffix = QWORD_MNEM_SUFFIX;
+         else
+           i.suffix = LONG_MNEM_SUFFIX;
+
+         i.tm.opcode_modifier.w = 0;
+       }
+    }
+
+  if (!i.tm.opcode_modifier.modrm && i.reg_operands && i.tm.operands < 3)
+    i.short_form = (i.tm.operand_types[0].bitfield.class == Reg)
+                  != (i.tm.operand_types[1].bitfield.class == Reg);
+
    /* Change the opcode based on the operand size given by i.suffix.  */
    switch (i.suffix)
      {
@@ -6508,7 +6606,7 @@ process_suffix (void)
        /* It's not a byte, select word/dword operation.  */
        if (i.tm.opcode_modifier.w)
         {
-         if (i.tm.opcode_modifier.shortform)
+         if (i.short_form)
             i.tm.base_opcode |= 8;
           else
             i.tm.base_opcode |= 1;
@@ -6518,28 +6616,13 @@ process_suffix (void)
        /* Now select between word & dword operations via the operand
          size prefix, except for instructions that will ignore this
          prefix anyway.  */
-      if (i.reg_operands > 0
-         && i.types[0].bitfield.class == Reg
-         && i.tm.opcode_modifier.addrprefixopreg
-         && (i.tm.operand_types[0].bitfield.instance == Accum
-             || i.operands == 1))
-       {
-         /* The address size override prefix changes the size of the
-            first operand.  */
-         if ((flag_code == CODE_32BIT
-              && i.op[0].regs->reg_type.bitfield.word)
-             || (flag_code != CODE_32BIT
-                 && i.op[0].regs->reg_type.bitfield.dword))
-           if (!add_prefix (ADDR_PREFIX_OPCODE))
-             return 0;
-       }
-      else if (i.suffix != QWORD_MNEM_SUFFIX
-              && !i.tm.opcode_modifier.ignoresize
-              && !i.tm.opcode_modifier.floatmf
-              && !is_any_vex_encoding (&i.tm)
-              && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
-                  || (flag_code == CODE_64BIT
-                      && i.tm.opcode_modifier.jump == JUMP_BYTE)))
+      if (i.suffix != QWORD_MNEM_SUFFIX
+         && !i.tm.opcode_modifier.ignoresize
+         && !i.tm.opcode_modifier.floatmf
+         && !is_any_vex_encoding (&i.tm)
+         && ((i.suffix == LONG_MNEM_SUFFIX) == (flag_code == CODE_16BIT)
+             || (flag_code == CODE_64BIT
+                 && i.tm.opcode_modifier.jump == JUMP_BYTE)))
         {
           unsigned int prefix = DATA_PREFIX_OPCODE;
  
@@ -6568,39 +6651,70 @@ process_suffix (void)
        break;
      }
  
-  if (i.reg_operands != 0
-      && i.operands > 1
-      && i.tm.opcode_modifier.addrprefixopreg
-      && i.tm.operand_types[0].bitfield.instance != Accum)
+  if (i.tm.opcode_modifier.addrprefixopreg)
      {
-      /* Check invalid register operand when the address size override
-        prefix changes the size of register operands.  */
-      unsigned int op;
-      enum { need_word, need_dword, need_qword } need;
+      gas_assert (!i.suffix);
+      gas_assert (i.reg_operands);
+
+      if (i.tm.operand_types[0].bitfield.instance == Accum
+         || i.operands == 1)
+       {
+         /* The address size override prefix changes the size of the
+            first operand.  */
+         if (flag_code == CODE_64BIT
+             && i.op[0].regs->reg_type.bitfield.word)
+           {
+             as_bad (_("16-bit addressing unavailable for `%s'"),
+                     i.tm.name);
+             return 0;
+           }
  
-      if (flag_code == CODE_32BIT)
-       need = i.prefix[ADDR_PREFIX] ? need_word : need_dword;
+         if ((flag_code == CODE_32BIT
+              ? i.op[0].regs->reg_type.bitfield.word
+              : i.op[0].regs->reg_type.bitfield.dword)
+             && !add_prefix (ADDR_PREFIX_OPCODE))
+           return 0;
+       }
        else
         {
-         if (i.prefix[ADDR_PREFIX])
+         /* Check invalid register operand when the address size override
+            prefix changes the size of register operands.  */
+         unsigned int op;
+         enum { need_word, need_dword, need_qword } need;
+
+         if (flag_code == CODE_32BIT)
+           need = i.prefix[ADDR_PREFIX] ? need_word : need_dword;
+         else if (i.prefix[ADDR_PREFIX])
             need = need_dword;
           else
             need = flag_code == CODE_64BIT ? need_qword : need_word;
-       }
  
-      for (op = 0; op < i.operands; op++)
-       if (i.types[op].bitfield.class == Reg
-           && ((need == need_word
-                && !i.op[op].regs->reg_type.bitfield.word)
-               || (need == need_dword
-                   && !i.op[op].regs->reg_type.bitfield.dword)
-               || (need == need_qword
-                   && !i.op[op].regs->reg_type.bitfield.qword)))
-         {
-           as_bad (_("invalid register operand size for `%s'"),
-                   i.tm.name);
-           return 0;
-         }
+         for (op = 0; op < i.operands; op++)
+           {
+             if (i.types[op].bitfield.class != Reg)
+               continue;
+
+             switch (need)
+               {
+               case need_word:
+                 if (i.op[op].regs->reg_type.bitfield.word)
+                   continue;
+                 break;
+               case need_dword:
+                 if (i.op[op].regs->reg_type.bitfield.dword)
+                   continue;
+                 break;
+               case need_qword:
+                 if (i.op[op].regs->reg_type.bitfield.qword)
+                   continue;
+                 break;
+               }
+
+             as_bad (_("invalid register operand size for `%s'"),
+                     i.tm.name);
+             return 0;
+           }
+       }
      }
  
    return 1;
@@ -6628,31 +6742,10 @@ check_byte_reg (void)
           && i.tm.operand_types[op].bitfield.word)
         continue;
  
-      /* crc32 doesn't generate this warning.  */
-      if (i.tm.base_opcode == 0xf20f38f0)
+      /* crc32 only wants its source operand checked here.  */
+      if (i.tm.base_opcode == 0xf20f38f0 && op)
         continue;
  
-      if ((i.types[op].bitfield.word
-          || i.types[op].bitfield.dword
-          || i.types[op].bitfield.qword)
-         && i.op[op].regs->reg_num < 4
-         /* Prohibit these changes in 64bit mode, since the lowering
-            would be more complicated.  */
-         && flag_code != CODE_64BIT)
-       {
-#if REGISTER_WARNINGS
-         if (!quiet_warnings)
-           as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
-                    register_prefix,
-                    (i.op[op].regs + (i.types[op].bitfield.word
-                                      ? REGNAM_AL - REGNAM_AX
-                                      : REGNAM_AL - REGNAM_EAX))->reg_name,
-                    register_prefix,
-                    i.op[op].regs->reg_name,
-                    i.suffix);
-#endif
-         continue;
-       }
        /* Any other register is bad.  */
        if (i.types[op].bitfield.class == Reg
           || i.types[op].bitfield.class == RegMMX
@@ -6697,28 +6790,16 @@ check_long_reg (void)
                 i.suffix);
         return 0;
        }
-    /* Warn if the e prefix on a general reg is missing.  */
-    else if ((!quiet_warnings || flag_code == CODE_64BIT)
-            && i.types[op].bitfield.word
+    /* Error if the e prefix on a general reg is missing.  */
+    else if (i.types[op].bitfield.word
              && (i.tm.operand_types[op].bitfield.class == Reg
                  || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.dword)
        {
-       /* Prohibit these changes in the 64bit mode, since the
-          lowering is more complicated.  */
-       if (flag_code == CODE_64BIT)
-         {
-           as_bad (_("incorrect register `%s%s' used with `%c' suffix"),
-                   register_prefix, i.op[op].regs->reg_name,
-                   i.suffix);
-           return 0;
-         }
-#if REGISTER_WARNINGS
-       as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
-                register_prefix,
-                (i.op[op].regs + REGNAM_EAX - REGNAM_AX)->reg_name,
-                register_prefix, i.op[op].regs->reg_name, i.suffix);
-#endif
+       as_bad (_("incorrect register `%s%s' used with `%c' suffix"),
+               register_prefix, i.op[op].regs->reg_name,
+               i.suffix);
+       return 0;
        }
      /* Warn if the r prefix on a general reg is present.  */
      else if (i.types[op].bitfield.qword
@@ -6818,29 +6899,17 @@ check_word_reg (void)
                 i.suffix);
         return 0;
        }
-    /* Warn if the e or r prefix on a general reg is present.  */
-    else if ((!quiet_warnings || flag_code == CODE_64BIT)
-            && (i.types[op].bitfield.dword
+    /* Error if the e or r prefix on a general reg is present.  */
+    else if ((i.types[op].bitfield.dword
                  || i.types[op].bitfield.qword)
              && (i.tm.operand_types[op].bitfield.class == Reg
                  || i.tm.operand_types[op].bitfield.instance == Accum)
              && i.tm.operand_types[op].bitfield.word)
        {
-       /* Prohibit these changes in the 64bit mode, since the
-          lowering is more complicated.  */
-       if (flag_code == CODE_64BIT)
-         {
-           as_bad (_("incorrect register `%s%s' used with `%c' suffix"),
-                   register_prefix, i.op[op].regs->reg_name,
-                   i.suffix);
-           return 0;
-         }
-#if REGISTER_WARNINGS
-       as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
-                register_prefix,
-                (i.op[op].regs + REGNAM_AX - REGNAM_EAX)->reg_name,
-                register_prefix, i.op[op].regs->reg_name, i.suffix);
-#endif
+       as_bad (_("incorrect register `%s%s' used with `%c' suffix"),
+               register_prefix, i.op[op].regs->reg_name,
+               i.suffix);
+       return 0;
        }
    return 1;
  }
@@ -7006,7 +7075,7 @@ process_operands (void)
         }
        else
         {
-duplicate:
+       duplicate:
           i.operands++;
           i.reg_operands++;
           i.tm.operands++;
@@ -7116,7 +7185,7 @@ duplicate:
          on one of their operands, the default segment is ds.  */
        default_seg = &ds;
      }
-  else if (i.tm.opcode_modifier.shortform)
+  else if (i.short_form)
      {
        /* The register or float register operand is in operand
          0 or 1.  */
@@ -7146,17 +7215,27 @@ duplicate:
         }
      }
  
-  if (i.tm.base_opcode == 0x8d /* lea */
-      && i.seg[0]
-      && !quiet_warnings)
-    as_warn (_("segment override on `%s' is ineffectual"), i.tm.name);
+  if ((i.seg[0] || i.prefix[SEG_PREFIX])
+      && i.tm.base_opcode == 0x8d /* lea */
+      && !is_any_vex_encoding(&i.tm))
+    {
+      if (!quiet_warnings)
+       as_warn (_("segment override on `%s' is ineffectual"), i.tm.name);
+      if (optimize)
+       {
+         i.seg[0] = NULL;
+         i.prefix[SEG_PREFIX] = 0;
+       }
+    }
  
    /* If a segment was explicitly specified, and the specified segment
-     is not the default, use an opcode prefix to select it.  If we
-     never figured out what the default segment is, then default_seg
-     will be zero at this point, and the specified segment prefix will
-     always be used.  */
-  if ((i.seg[0]) && (i.seg[0] != default_seg))
+     is neither the default nor the one already recorded from a prefix,
+     use an opcode prefix to select it.  If we never figured out what
+     the default segment is, then default_seg will be zero at this
+     point, and the specified segment prefix will always be used.  */
+  if (i.seg[0]
+      && i.seg[0] != default_seg
+      && i.seg[0]->seg_prefix != i.prefix[SEG_PREFIX])
      {
        if (!add_prefix (i.seg[0]->seg_prefix))
         return 0;
@@ -7862,6 +7941,18 @@ build_modrm_byte (void)
    return default_seg;
  }
  
+static unsigned int
+flip_code16 (unsigned int code16) /* Yields CODE16 for the caller to XOR into code16, or 0 to leave it unchanged.  */
+{
+  gas_assert (i.tm.operands == 1); /* Only operand 0's displacement types are consulted below.  */
+
+  return !(i.prefix[REX_PREFIX] & REX_W) /* Never flip when REX.W is present.  */
+        && (code16 ? i.tm.operand_types[0].bitfield.disp32 /* code16 set: template must accept Disp32 ...  */
+                     || i.tm.operand_types[0].bitfield.disp32s /* ... or Disp32S.  */
+                   : i.tm.operand_types[0].bitfield.disp16) /* code16 clear: template must accept Disp16.  */
+        ? CODE16 : 0;
+}
+
  static void
  output_branch (void)
  {
@@ -7881,7 +7972,7 @@ output_branch (void)
      {
        prefix = 1;
        i.prefixes -= 1;
-      code16 ^= CODE16;
+      code16 ^= flip_code16(code16);
      }
    /* Pentium4 branch hints.  */
    if (i.prefix[SEG_PREFIX] == CS_PREFIX_OPCODE /* not taken */
@@ -7899,12 +7990,12 @@ output_branch (void)
    /* BND prefixed jump.  */
    if (i.prefix[BND_PREFIX] != 0)
      {
-      FRAG_APPEND_1_CHAR (i.prefix[BND_PREFIX]);
-      i.prefixes -= 1;
+      prefix++;
+      i.prefixes--;
      }
  
-  if (i.prefixes != 0 && !intel_syntax)
-    as_warn (_("skipping prefixes on this instruction"));
+  if (i.prefixes != 0)
+    as_warn (_("skipping prefixes on `%s'"), i.tm.name);
  
    /* It's always a symbol;  End frag & setup for relax.
       Make sure there is enough room in this frag for the largest
@@ -7919,6 +8010,8 @@ output_branch (void)
    if (i.prefix[SEG_PREFIX] == CS_PREFIX_OPCODE
        || i.prefix[SEG_PREFIX] == DS_PREFIX_OPCODE)
      *p++ = i.prefix[SEG_PREFIX];
+  if (i.prefix[BND_PREFIX] != 0)
+    *p++ = BND_PREFIX_OPCODE;
    if (i.prefix[REX_PREFIX] != 0)
      *p++ = i.prefix[REX_PREFIX];
    *p = i.tm.base_opcode;
@@ -8023,7 +8116,7 @@ output_jump (void)
         {
           FRAG_APPEND_1_CHAR (DATA_PREFIX_OPCODE);
           i.prefixes -= 1;
-         code16 ^= CODE16;
+         code16 ^= flip_code16(code16);
         }
  
        size = 4;
@@ -8031,21 +8124,21 @@ output_jump (void)
         size = 2;
      }
  
-  if (i.prefix[REX_PREFIX] != 0)
+  /* BND prefixed jump.  */
+  if (i.prefix[BND_PREFIX] != 0)
      {
-      FRAG_APPEND_1_CHAR (i.prefix[REX_PREFIX]);
+      FRAG_APPEND_1_CHAR (i.prefix[BND_PREFIX]);
        i.prefixes -= 1;
      }
  
-  /* BND prefixed jump.  */
-  if (i.prefix[BND_PREFIX] != 0)
+  if (i.prefix[REX_PREFIX] != 0)
      {
-      FRAG_APPEND_1_CHAR (i.prefix[BND_PREFIX]);
+      FRAG_APPEND_1_CHAR (i.prefix[REX_PREFIX]);
        i.prefixes -= 1;
      }
  
-  if (i.prefixes != 0 && !intel_syntax)
-    as_warn (_("skipping prefixes on this instruction"));
+  if (i.prefixes != 0)
+    as_warn (_("skipping prefixes on `%s'"), i.tm.name);
  
    p = frag_more (i.tm.opcode_length + size);
    switch (i.tm.opcode_length)
@@ -8098,18 +8191,15 @@ output_interseg_jump (void)
        i.prefixes -= 1;
        code16 ^= CODE16;
      }
-  if (i.prefix[REX_PREFIX] != 0)
-    {
-      prefix++;
-      i.prefixes -= 1;
-    }
+
+  gas_assert (!i.prefix[REX_PREFIX]);
  
    size = 4;
    if (code16)
      size = 2;
  
-  if (i.prefixes != 0 && !intel_syntax)
-    as_warn (_("skipping prefixes on this instruction"));
+  if (i.prefixes != 0)
+    as_warn (_("skipping prefixes on `%s'"), i.tm.name);
  
    /* 1 opcode; 2 segment; offset  */
    p = frag_more (prefix + 1 + 2 + size);
@@ -8546,7 +8636,9 @@ output_insn (void)
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_X87;
        if (i.has_regmmx
           || i.tm.base_opcode == 0xf77 /* emms */
-         || i.tm.base_opcode == 0xf0e /* femms */)
+         || i.tm.base_opcode == 0xf0e /* femms */
+         || i.tm.base_opcode == 0xf2a /* cvtpi2ps */
+         || i.tm.base_opcode == 0x660f2a /* cvtpi2pd */)
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_MMX;
        if (i.has_regxmm)
         x86_feature_2_used |= GNU_PROPERTY_X86_FEATURE_2_XMM;
@@ -8709,10 +8801,13 @@ output_insn (void)
  #if defined (OBJ_MAYBE_ELF) || defined (OBJ_ELF)
           /* For x32, add a dummy REX_OPCODE prefix for mov/add with
              R_X86_64_GOTTPOFF relocation so that linker can safely
-            perform IE->LE optimization.  */
+            perform IE->LE optimization.  A dummy REX_OPCODE prefix
+            is also needed for lea with R_X86_64_GOTPC32_TLSDESC
+            relocation for GDesc -> IE/LE optimization.  */
           if (x86_elf_abi == X86_64_X32_ABI
               && i.operands == 2
-             && i.reloc[0] == BFD_RELOC_X86_64_GOTTPOFF
+             && (i.reloc[0] == BFD_RELOC_X86_64_GOTTPOFF
+                 || i.reloc[0] == BFD_RELOC_X86_64_GOTPC32_TLSDESC)
               && i.prefix[REX_PREFIX] == 0)
             add_prefix (REX_OPCODE);
  #endif
@@ -9074,13 +9169,14 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
                           && i.rm.regmem == 5))
                   && (i.rm.mode == 2
                       || (i.rm.mode == 0 && i.rm.regmem == 5))
+                 && !is_any_vex_encoding(&i.tm)
                   && ((i.operands == 1
                        && i.tm.base_opcode == 0xff
                        && (i.rm.reg == 2 || i.rm.reg == 4))
                       || (i.operands == 2
                           && (i.tm.base_opcode == 0x8b
                               || i.tm.base_opcode == 0x85
-                             || (i.tm.base_opcode & 0xc7) == 0x03))))
+                             || (i.tm.base_opcode & ~0x38) == 0x03))))
                 {
                   if (object_64bit)
                     {
@@ -9938,10 +10034,11 @@ i386_displacement (char *disp_start, char *disp_end)
  
    operand_type_set (&bigdisp, 0);
    if (i.jumpabsolute
+      || i.types[this_operand].bitfield.baseindex
        || (current_templates->start->opcode_modifier.jump != JUMP
           && current_templates->start->opcode_modifier.jump != JUMP_DWORD))
      {
-      bigdisp.bitfield.disp32 = 1;
+      i386_addressing_mode ();
        override = (i.prefix[ADDR_PREFIX] != 0);
        if (flag_code == CODE_64BIT)
         {
@@ -9950,27 +10047,47 @@ i386_displacement (char *disp_start, char *disp_end)
               bigdisp.bitfield.disp32s = 1;
               bigdisp.bitfield.disp64 = 1;
             }
+         else
+           bigdisp.bitfield.disp32 = 1;
         }
        else if ((flag_code == CODE_16BIT) ^ override)
-       {
-         bigdisp.bitfield.disp32 = 0;
           bigdisp.bitfield.disp16 = 1;
-       }
+      else
+         bigdisp.bitfield.disp32 = 1;
      }
    else
      {
-      /* For PC-relative branches, the width of the displacement
-        is dependent upon data size, not address size.  */
+      /* For PC-relative branches, the width of the displacement may be
+        dependent upon data size, but is never dependent upon address size.
+        Also make sure to not unintentionally match against a non-PC-relative
+        branch template.  */
+      static templates aux_templates;
+      const insn_template *t = current_templates->start;
+      bfd_boolean has_intel64 = FALSE;
+
+      aux_templates.start = t;
+      while (++t < current_templates->end)
+       {
+         if (t->opcode_modifier.jump
+             != current_templates->start->opcode_modifier.jump)
+           break;
+         if ((t->opcode_modifier.isa64 >= INTEL64))
+           has_intel64 = TRUE;
+       }
+      if (t < current_templates->end)
+       {
+         aux_templates.end = t;
+         current_templates = &aux_templates;
+       }
+
        override = (i.prefix[DATA_PREFIX] != 0);
        if (flag_code == CODE_64BIT)
         {
-         if (override || i.suffix == WORD_MNEM_SUFFIX)
+         if ((override || i.suffix == WORD_MNEM_SUFFIX)
+             && (isa64 != intel64 || !has_intel64)) /* Test the selected ISA; `intel64' is only an enumerator.  */
             bigdisp.bitfield.disp16 = 1;
           else
-           {
-             bigdisp.bitfield.disp32 = 1;
-             bigdisp.bitfield.disp32s = 1;
-           }
+           bigdisp.bitfield.disp32s = 1;
         }
        else
         {
@@ -10143,6 +10260,11 @@ i386_finalize_displacement (segT exp_seg ATTRIBUTE_UNUSED, expressionS *exp,
      }
  #endif
  
+  if (current_templates->start->opcode_modifier.jump == JUMP_BYTE
+      /* Constants get taken care of by optimize_disp().  */
+      && exp->X_op != O_constant)
+    i.types[this_operand].bitfield.disp8 = 1;
+
    /* Check if this is a displacement only operand.  */
    bigdisp = i.types[this_operand];
    bigdisp.bitfield.disp8 = 0;
@@ -10282,7 +10404,7 @@ i386_index_check (const char *operand_string)
        else
         return 1;
  
-bad_address:
+    bad_address:
        as_bad (_("`%s' is not a valid %s expression"),
               operand_string, kind);
        return 0;
@@ -12000,6 +12122,7 @@ const char *md_shortopts = "qnO::";
  #define OPTION_MALIGN_BRANCH_BOUNDARY (OPTION_MD_BASE + 27)
  #define OPTION_MALIGN_BRANCH_PREFIX_SIZE (OPTION_MD_BASE + 28)
  #define OPTION_MALIGN_BRANCH (OPTION_MD_BASE + 29)
+#define OPTION_MBRANCHES_WITH_32B_BOUNDARIES (OPTION_MD_BASE + 30)
  
  struct option md_longopts[] =
  {
@@ -12038,6 +12161,7 @@ struct option md_longopts[] =
    {"malign-branch-boundary", required_argument, NULL, OPTION_MALIGN_BRANCH_BOUNDARY},
    {"malign-branch-prefix-size", required_argument, NULL, OPTION_MALIGN_BRANCH_PREFIX_SIZE},
    {"malign-branch", required_argument, NULL, OPTION_MALIGN_BRANCH},
+  {"mbranches-within-32B-boundaries", no_argument, NULL, OPTION_MBRANCHES_WITH_32B_BOUNDARIES},
    {"mamd64", no_argument, NULL, OPTION_MAMD64},
    {"mintel64", no_argument, NULL, OPTION_MINTEL64},
    {NULL, no_argument, NULL, 0}
@@ -12500,12 +12624,20 @@ md_parse_option (int c, const char *arg)
        free (saved);
        break;
  
+    case OPTION_MBRANCHES_WITH_32B_BOUNDARIES:
+      align_branch_power = 5;
+      align_branch_prefix_size = 5;
+      align_branch = (align_branch_jcc_bit
+                     | align_branch_fused_bit
+                     | align_branch_jmp_bit);
+      break;
+
      case OPTION_MAMD64:
-      intel64 = 0;
+      isa64 = amd64;
        break;
  
      case OPTION_MINTEL64:
-      intel64 = 1;
+      isa64 = intel64;
        break;
  
      case 'O':
@@ -12763,6 +12895,9 @@ md_show_usage (FILE *stream)
    -malign-branch-prefix-size=NUM (default: 5)\n\
                            align branches with NUM prefixes per instruction\n"));
    fprintf (stream, _("\
+  -mbranches-within-32B-boundaries\n\
+                          align branches within 32 byte boundary\n"));
+  fprintf (stream, _("\
    -mamd64                 accept only AMD64 ISA [default]\n"));
    fprintf (stream, _("\
    -mintel64               accept only Intel64 ISA\n"));