+Tue Jan 18 16:19:55 MET 2000 Jan Hubicka <hubicka@freesoft.cz>
+
+	* i386.md (movstrsi): Do not use rep movsb for counts divisible by 4
+	when optimize_size.
+ (clrstrsi): Rewrite.
+ (strsethi, strsetqi): New expanders.
+ (strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
+	(cmpstrsi): Emit compare insn before cmpstrsi_1.
+	(cmpstrsi_nz_1): Use flags; set type to str and length_prefix to 1.
+ (strlensi_1): Likewise.
+ (cmpstrsi_1): Likewise; do not output compare.
+ (strlen expander): Do not unroll when optimizing for size.
+	(*subsi3_carry): Rename to subsi3_carry.
+ (addqi3_cc): New pattern.
+ * i386.h (processor_costs): Add move_ratio field.
+	(MOVE_RATIO): Use move_ratio field; set to 3 when optimize_size.
+ * i386.c (*_cost): Set move_ratio.
+ (x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
+	(ix86_expand_strlensi_unroll_1): Rewrite the main loop.
+
2000-01-17 Richard Henderson <rth@cygnus.com>
* combine.c (combine_simplify_rtx): Give FLOAT_STORE_FLAG_VALUE a mode.
1, /* cost of multiply per each bit set */
23, /* cost of a divide/mod */
15, /* "large" insn */
+ 3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
1, /* cost of multiply per each bit set */
40, /* cost of a divide/mod */
15, /* "large" insn */
+ 3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
0, /* cost of multiply per each bit set */
25, /* cost of a divide/mod */
8, /* "large" insn */
+ 6, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
0, /* cost of multiply per each bit set */
17, /* cost of a divide/mod */
8, /* "large" insn */
+ 6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
0, /* cost of multiply per each bit set */
18, /* cost of a divide/mod */
8, /* "large" insn */
+ 4, /* MOVE_RATIO */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
0, /* cost of multiply per each bit set */
19, /* cost of a divide/mod */
8, /* "large" insn */
+ 9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT;
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
const int x86_use_any_reg = m_486;
const int x86_cmove = m_PPRO | m_ATHLON;
rtx align_3_label = NULL_RTX;
rtx align_4_label = gen_label_rtx ();
rtx end_0_label = gen_label_rtx ();
- rtx end_2_label = gen_label_rtx ();
- rtx end_3_label = gen_label_rtx ();
rtx mem;
rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx tmpreg = gen_reg_rtx (SImode);
align = 0;
if (GET_CODE (align_rtx) == CONST_INT)
mem = gen_rtx_MEM (SImode, out);
emit_move_insn (scratch, mem);
-
- /* Check first byte. */
- emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
- tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, end_0_label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
- /* Check second byte. */
- emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
- tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, end_3_label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
- /* Check third byte. */
- emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
- tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, end_2_label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
- /* Check fourth byte and increment address. */
emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
- emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
- tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, align_4_label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
- /* Now generate fixups when the compare stops within a 4-byte word. */
- emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
-
- emit_label (end_2_label);
- emit_insn (gen_addsi3 (out, out, const1_rtx));
- emit_label (end_3_label);
- emit_insn (gen_addsi3 (out, out, const1_rtx));
+  /* This formula yields a nonzero result iff one of the bytes is zero.
+     This saves three branches inside the loop and many cycles.  */
+
+ emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
+ emit_insn (gen_one_cmplsi2 (scratch, scratch));
+ emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
+ emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
+  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0,
+			   align_4_label);
+
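The formula emitted above is the standard word-at-a-time zero-byte test: for a 32-bit word x, (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when some byte of x is zero. A minimal C sketch of the check (illustrative only; the function name is not part of the patch):

    #include <stdint.h>

    /* Nonzero iff some byte of X is zero: subtracting 1 from every byte
       borrows into bit 7 only when the byte was 0x00, and masking with
       ~X rejects bytes whose bit 7 was already set.  */
    static int
    has_zero_byte (uint32_t x)
    {
      return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
    }

For x = 0x41004344, for instance, the expression evaluates to 0x00800000: bit 7 of exactly the zero byte.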
+ if (TARGET_CMOVE)
+ {
+ rtx reg = gen_reg_rtx (SImode);
+ emit_move_insn (reg, tmpreg);
+ emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
+
+ /* If zero is not in the first two bytes, move two bytes forward. */
+ emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
+ gen_rtx_IF_THEN_ELSE (SImode, tmp,
+ reg,
+ tmpreg)));
+      /* Emit lea manually to avoid clobbering the flags.  */
+ emit_insn (gen_rtx_SET (SImode, reg,
+ gen_rtx_PLUS (SImode, out, GEN_INT (2))));
+
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, out,
+ gen_rtx_IF_THEN_ELSE (SImode, tmp,
+ reg,
+ out)));
+
+ }
+ else
+ {
+ rtx end_2_label = gen_label_rtx ();
+ /* Is zero in the first two bytes? */
+
+ emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, end_2_label),
+ pc_rtx);
+ tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+ JUMP_LABEL (tmp) = end_2_label;
+
+ /* Not in the first two. Move two bytes forward. */
+ emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
+ emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
+
+ emit_label (end_2_label);
+
+ }
+
+  /* Avoid a branch in fixing the final byte position.  */
+ tmpreg = gen_lowpart (QImode, tmpreg);
+ emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
+ emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
emit_label (end_0_label);
}
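The tail fixup is also branch-free: addqi3_cc doubles the low byte of tmpreg, shifting its bit 7 (the "this byte was zero" flag) into the carry, and subsi3_carry then subtracts 3 plus the carry, so out moves back by 4 or 3 as appropriate. A C model of the whole fixup, assuming tmpreg holds the 0x80-mask from the formula above and out already points 4 bytes past the scanned word (names illustrative):

    #include <stdint.h>

    static const char *
    fixup_position (uint32_t mask, const char *out)
    {
      if (!(mask & 0x8080))	/* zero byte is in the upper half; this   */
        {			/* test is a cmov (or a short jump) above */
          mask >>= 16;
          out += 2;
        }
      /* addb %tmp,%tmp moves bit 7 of the low byte into the carry flag;
         sbbl $3,%out then subtracts 4 when that byte was the zero one,
         and 3 otherwise.  */
      out -= 3 + ((mask & 0x80) != 0);
      return out;
    }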
int mult_bit; /* cost of multiply per each bit set */
int divide; /* cost of a divide/mod */
int large_insn; /* insns larger than this cost more */
+  int move_ratio;		/* Largest number of scalar move insns
+				   emitted for a memory-to-memory copy
+				   before a block move is used.  */
int movzbl_load; /* cost of loading using movzbl */
int int_load[3]; /* cost of loading integer registers
in QImode, HImode and SImode relative
Increasing the value will always make code faster, but eventually
incurs high cost in increased code size.
- If you don't define this, a reasonable default is used.
+ If you don't define this, a reasonable default is used. */
- Make this large on i386, since the block move is very inefficient with small
- blocks, and the hard register needs of the block move require much reload
- work. */
-
-#define MOVE_RATIO 5
+#define MOVE_RATIO (optimize_size ? 3 : ix86_cost->move_ratio)
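MOVE_RATIO is the middle end's threshold for expanding a memory-to-memory copy inline as individual scalar moves rather than as a block move; the per-processor move_ratio values added to the cost tables above feed it, and -Os pins it at 3. A hypothetical illustration of the decision it drives (struct sizes chosen for the example):

    struct s8  { int a, b; };	/* 8 bytes: two SImode moves  */
    struct s64 { int a[16]; };	/* 64 bytes: over the limit   */

    /* With a move ratio of 3, copy8 can be expanded as two movl
       instructions, while copy64 falls back to a block move such as
       rep movsl (or a memcpy call).  */
    void copy8  (struct s8 *d,  struct s8 *s)  { *d = *s; }
    void copy64 (struct s64 *d, struct s64 *s) { *d = *s; }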
/* Define if shifts truncate the shift count
which implies one can omit a sign-extension or zero-extension
"add{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
+(define_insn "addqi3_cc"
+ [(set (reg:CC 17) (plus:CC (match_operand:QI 1 "nonimmediate_operand" "%0,0")
+			     (match_operand:QI 2 "general_operand" "qi,qm")))
+   (set (match_operand:QI 0 "nonimmediate_operand" "=qm,q")
+ (plus:QI (match_dup 1) (match_dup 2)))]
+ "ix86_binary_operator_ok (PLUS, QImode, operands)"
+ "add{b}\\t{%2, %0|%0, %2}"
+ [(set_attr "type" "alu")])
+
(define_insn "*addsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0")
"sub{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
-(define_insn "*subsi3_carry"
+(define_insn "subsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
(plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
- /* When optimizing for size emit simple rep ; movsb instruction. */
- if (!optimize || optimize_size)
+  /* When not optimizing, or when optimizing for size, emit a simple
+     rep ; movsb for counts not divisible by 4.  */
+ if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
(set_attr "memory" "both")])
(define_expand "clrstrsi"
- [(set (reg:SI 19) (const_int 0))
- (set (match_dup 3) (const_int 0))
- (parallel [(set (match_operand:BLK 0 "memory_operand" "")
- (const_int 0))
- (use (match_operand:SI 1 "const_int_operand" ""))
- (use (match_operand:SI 2 "const_int_operand" ""))
- (use (match_dup 3))
- (use (reg:SI 19))
- (clobber (match_scratch:SI 4 ""))
- (clobber (match_dup 5))])]
+ [(use (match_operand:BLK 0 "memory_operand" ""))
+ (use (match_operand:SI 1 "const_int_operand" ""))
+ (use (match_operand:SI 2 "const_int_operand" ""))]
""
"
{
- rtx addr0;
+ rtx destreg, zeroreg, countreg;
if (GET_CODE (operands[1]) != CONST_INT)
FAIL;
- addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
+ destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
+
+ emit_insn (gen_cld());
+
+  /* When not optimizing, or when optimizing for size, emit a simple
+     rep ; stosb for counts not divisible by 4.  */
+ if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
+ {
+ countreg = copy_to_mode_reg (SImode, operands[1]);
+ zeroreg = copy_to_mode_reg (QImode, const0_rtx);
+ emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
+ destreg, countreg));
+ }
+ else
+ {
+ zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+ if (INTVAL (operands[1]) & ~0x03)
+ {
+ countreg = copy_to_mode_reg (SImode,
+ GEN_INT ((INTVAL (operands[1]) >> 2)
+ & 0x3fffffff));
+ emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
+ destreg, countreg));
+ }
+ if (INTVAL (operands[1]) & 0x02)
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ if (INTVAL (operands[1]) & 0x01)
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ }
+ DONE;
+}")
+
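In C terms, the rewritten expander decomposes a constant-count clear as sketched below; each arm is labeled with the insn pattern it maps to (a model only: the expander emits RTL, the !optimize case is folded into opt_size here, and the word loop assumes suitable alignment, a restriction the real rep stosl does not have):

    #include <stddef.h>
    #include <stdint.h>

    static void
    clrstr_model (unsigned char *dst, size_t n, int opt_size)
    {
      if (opt_size && (n & 0x03))
        {
          /* rep_stosqi: one compact "rep stosb" clears everything.  */
          for (size_t i = 0; i < n; i++)
            dst[i] = 0;
        }
      else
        {
          uint32_t *p = (uint32_t *) dst;
          for (size_t i = 0; i < n / 4; i++)	/* rep_stossi */
            p[i] = 0;
          dst += n & ~(size_t) 3;
          if (n & 2)				/* strsethi: stosw */
            { dst[0] = 0; dst[1] = 0; dst += 2; }
          if (n & 1)				/* strsetqi: stosb */
            *dst = 0;
        }
    }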
+;; Most CPUs don't like single string operations, so handle this case
+;; here to simplify the previous expander.
- operands[3] = gen_reg_rtx (SImode);
- operands[5] = addr0;
+(define_expand "strsethi"
+ [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
+ (match_operand:HI 1 "register_operand" ""))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
+ DONE;
+ }
+}")
- operands[0] = gen_rtx_MEM (BLKmode, addr0);
+(define_expand "strsetqi"
+ [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
+ (match_operand:QI 1 "register_operand" ""))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
+ DONE;
+ }
}")
+(define_insn "strsethi_1"
+ [(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:HI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+	(plus:SI (match_dup 1)
+ (const_int 2)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosw"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")
+ (set_attr "length_prefix" "1")])
+
+(define_insn "strsetqi_1"
+ [(set (mem:QI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:QI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+	(plus:SI (match_dup 1)
+ (const_int 1)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosb"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")])
+
;; It might seem that operand 0 could use predicate register_operand.
;; But strength reduction might offset the MEM expression. So we let
;; reload put the address into %edi.
-(define_insn "*clrstrsi_1"
- [(set (mem:BLK (match_operand:SI 0 "address_operand" "D"))
+(define_insn "rep_stossi"
+ [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
+ (use (match_operand:SI 2 "register_operand" "a"))
+ (use (match_operand:SI 4 "register_operand" "1"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+	(plus:SI (ashift:SI (match_dup 4) (const_int 2))
+		 (match_operand:SI 3 "address_operand" "0")))
+ (set (mem:BLK (match_dup 3))
(const_int 0))
- (use (match_operand:SI 1 "const_int_operand" "n"))
- (use (match_operand:SI 2 "immediate_operand" "i"))
- (use (match_operand:SI 3 "register_operand" "a"))
- (use (reg:SI 19))
- (clobber (match_scratch:SI 4 "=&c"))
- (clobber (match_dup 0))]
+ (use (reg:SI 19))]
""
- "*
-{
- rtx xops[2];
+ "rep\;stosl|rep stosd"
+ [(set_attr "type" "str")
+ (set_attr "length_prefix" "1")
+ (set_attr "memory" "store")])
- if (GET_CODE (operands[1]) == CONST_INT)
- {
- unsigned int count = INTVAL (operands[1]) & 0xffffffff;
- if (count & ~0x03)
- {
- xops[0] = GEN_INT (count / 4);
- xops[1] = operands[4];
-
- /* K6: stos takes 1 cycle, rep stos takes 8 + %ecx cycles.
- 80386: 4/5+5n (+2 for set of ecx)
- 80486: 5/7+5n (+1 for set of ecx)
- */
- if (count / 4 < ((int) ix86_cpu < (int)PROCESSOR_PENTIUM ? 4 : 6))
- {
- do
- output_asm_insn (\"{stosl|stosd}\", xops);
- while ((count -= 4) > 3);
- }
- else
- {
- output_asm_insn (\"mov{l}\\t{%0, %1|%1, %0}\", xops);
- output_asm_insn (\"{rep\;stosl|rep stosd}\", xops);
- }
- }
- if (INTVAL (operands[1]) & 0x02)
- output_asm_insn (\"stosw\", operands);
- if (INTVAL (operands[1]) & 0x01)
- output_asm_insn (\"stosb\", operands);
- }
- else
- abort ();
- RET;
-}"
- [(set_attr "type" "multi")])
+(define_insn "rep_stosqi"
+ [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
+ (use (match_operand:QI 2 "register_operand" "a"))
+ (use (match_operand:SI 4 "register_operand" "1"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+	(plus:SI (match_operand:SI 3 "address_operand" "0") (match_dup 4)))
+ (set (mem:BLK (match_dup 3))
+ (const_int 0))
+ (use (reg:SI 19))]
+ ""
+ "rep\;stosb|rep stosb"
+ [(set_attr "type" "str")
+ (set_attr "length_prefix" "1")
+ (set_attr "memory" "store")])
(define_expand "cmpstrsi"
[(set (match_operand:SI 0 "register_operand" "")
emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
}
else
- emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
+ {
+ emit_insn (gen_cmpsi_1 (countreg, countreg));
+ emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
+ }
outlow = gen_lowpart (QImode, out);
emit_insn (gen_cmpintqi (outlow));
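The extra compare matters because "repz cmpsb" with %ecx == 0 performs no comparison at all and leaves the flags untouched; the flags must therefore already read "equal" before the string instruction runs, and comparing the count register with itself guarantees exactly that. A C model of the sequence (illustrative):

    static int
    cmpstr_model (const unsigned char *s1, const unsigned char *s2,
                  unsigned int count)
    {
      int zf = 1;		/* the new compare: flags say "equal" */
      while (count != 0 && zf)	/* repz cmpsb */
        {
          zf = (*s1++ == *s2++);
          count--;
        }
      /* For count == 0 the loop never runs and the pre-set flags
         correctly report equality.  */
      return zf ? 0 : (s1[-1] < s2[-1] ? -1 : 1);
    }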
(clobber (match_dup 2))]
""
"repz{\;| }cmpsb"
- [(set_attr "type" "multi")
- (set_attr "length" "3")])
+ [(set_attr "type" "str")
+ (set_attr "length_prefix" "1")])
;; The same, but the count is not known to not be zero.
(mem:BLK (match_operand:SI 1 "address_operand" "D")))
(const_int 0)))
(use (match_operand:SI 3 "immediate_operand" "i"))
+ (use (reg:CC 17))
(use (reg:SI 19))
(clobber (match_dup 0))
(clobber (match_dup 1))
(clobber (match_dup 2))]
""
- ;; The initial compare sets the zero flag.
- "cmp{l}\\t%2, %2\;repz{\;| }cmpsb"
- [(set_attr "type" "multi")
- (set_attr "length" "5")])
+ "repz{\;| }cmpsb"
+ [(set_attr "type" "str")
+ (set_attr "length_prefix" "1")])
(define_expand "strlensi"
[(set (match_operand:SI 0 "register_operand" "")
align = operands[3];
scratch1 = gen_reg_rtx (SImode);
- if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
+ if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+ && !optimize_size)
{
/* Well it seems that some optimizer does not combine a call like
foo(strlen(bar), strlen(bar));
(clobber (reg:CC 17))]
""
"repnz{\;| }scasb"
- [(set_attr "type" "multi")
- (set_attr "length" "3")])
+ [(set_attr "type" "str")
+ (set_attr "length_prefix" "1")])
\f
;; Conditional move instructions.