diff options
Diffstat (limited to 'gcc/loop-doloop.c')
-rw-r--r-- | gcc/loop-doloop.c | 695 |
1 file changed, 695 insertions, 0 deletions
diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c new file mode 100644 index 000000000..dcc20b191 --- /dev/null +++ b/gcc/loop-doloop.c @@ -0,0 +1,695 @@ +/* Perform doloop optimizations + Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010 + Free Software Foundation, Inc. + Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz) + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "rtl.h" +#include "flags.h" +#include "expr.h" +#include "hard-reg-set.h" +#include "basic-block.h" +#include "diagnostic-core.h" +#include "tm_p.h" +#include "cfgloop.h" +#include "output.h" +#include "params.h" +#include "target.h" + +/* This module is used to modify loops with a determinable number of + iterations to use special low-overhead looping instructions. + + It first validates whether the loop is well behaved and has a + determinable number of iterations (either at compile or run-time). + It then modifies the loop to use a low-overhead looping pattern as + follows: + + 1. A pseudo register is allocated as the loop iteration counter. + + 2. The number of loop iterations is calculated and is stored + in the loop counter. + + 3. At the end of the loop, the jump insn is replaced by the + doloop_end pattern. The compare must remain because it might be + used elsewhere. 
If the loop-variable or condition register are + used elsewhere, they will be eliminated by flow. + + 4. An optional doloop_begin pattern is inserted at the top of the + loop. + + TODO The optimization should only performed when either the biv used for exit + condition is unused at all except for the exit test, or if we do not have to + change its value, since otherwise we have to add a new induction variable, + which usually will not pay up (unless the cost of the doloop pattern is + somehow extremely lower than the cost of compare & jump, or unless the bct + register cannot be used for anything else but doloop -- ??? detect these + cases). */ + +#ifdef HAVE_doloop_end + +/* Return the loop termination condition for PATTERN or zero + if it is not a decrement and branch jump insn. */ + +rtx +doloop_condition_get (rtx doloop_pat) +{ + rtx cmp; + rtx inc; + rtx reg; + rtx inc_src; + rtx condition; + rtx pattern; + + /* The canonical doloop pattern we expect has one of the following + forms: + + 1) (parallel [(set (pc) (if_then_else (condition) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) + + The branch must be the first entry of the parallel (also required + by jump.c), and the second entry of the parallel must be a set of + the loop counter register. Some targets (IA-64) wrap the set of + the loop counter in an if_then_else too. + + 2) (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). */ + + pattern = PATTERN (doloop_pat); + + if (GET_CODE (pattern) != PARALLEL) + { + rtx cond; + rtx prev_insn = prev_nondebug_insn (doloop_pat); + + /* We expect the decrement to immediately precede the branch. 
*/ + + if (prev_insn == NULL_RTX || !INSN_P (prev_insn)) + return 0; + + cmp = pattern; + inc = PATTERN (PREV_INSN (doloop_pat)); + /* We expect the condition to be of the form (reg != 0) */ + cond = XEXP (SET_SRC (cmp), 0); + if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + return 0; + + } + else + { + cmp = XVECEXP (pattern, 0, 0); + inc = XVECEXP (pattern, 0, 1); + } + + /* Check for (set (reg) (something)). */ + if (GET_CODE (inc) != SET) + return 0; + reg = SET_DEST (inc); + if (! REG_P (reg)) + return 0; + + /* Check if something = (plus (reg) (const_int -1)). + On IA-64, this decrement is wrapped in an if_then_else. */ + inc_src = SET_SRC (inc); + if (GET_CODE (inc_src) == IF_THEN_ELSE) + inc_src = XEXP (inc_src, 1); + if (GET_CODE (inc_src) != PLUS + || XEXP (inc_src, 0) != reg + || XEXP (inc_src, 1) != constm1_rtx) + return 0; + + /* Check for (set (pc) (if_then_else (condition) + (label_ref (label)) + (pc))). */ + if (GET_CODE (cmp) != SET + || SET_DEST (cmp) != pc_rtx + || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE + || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF + || XEXP (SET_SRC (cmp), 2) != pc_rtx) + return 0; + + /* Extract loop termination condition. */ + condition = XEXP (SET_SRC (cmp), 0); + + /* We expect a GE or NE comparison with 0 or 1. */ + if ((GET_CODE (condition) != GE + && GET_CODE (condition) != NE) + || (XEXP (condition, 1) != const0_rtx + && XEXP (condition, 1) != const1_rtx)) + return 0; + + if ((XEXP (condition, 0) == reg) + || (GET_CODE (XEXP (condition, 0)) == PLUS + && XEXP (XEXP (condition, 0), 0) == reg)) + { + if (GET_CODE (pattern) != PARALLEL) + /* The second form we expect: + + (set (reg) (plus (reg) (const_int -1)) + (set (pc) (if_then_else (reg != 0) + (label_ref (label)) + (pc))). + + is equivalent to the following: + + (parallel [(set (pc) (if_then_else (reg != 1) + (label_ref (label)) + (pc))) + (set (reg) (plus (reg) (const_int -1))) + (additional clobbers and uses)]) + + So we return that form instead. 
+ */ + condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx); + + return condition; + } + + /* ??? If a machine uses a funny comparison, we could return a + canonicalized form here. */ + + return 0; +} + +/* Return nonzero if the loop specified by LOOP is suitable for + the use of special low-overhead looping instructions. DESC + describes the number of iterations of the loop. */ + +static bool +doloop_valid_p (struct loop *loop, struct niter_desc *desc) +{ + basic_block *body = get_loop_body (loop), bb; + rtx insn; + unsigned i; + bool result = true; + + /* Check for loops that may not terminate under special conditions. */ + if (!desc->simple_p + || desc->assumptions + || desc->infinite) + { + /* There are some cases that would require a special attention. + For example if the comparison is LEU and the comparison value + is UINT_MAX then the loop will not terminate. Similarly, if the + comparison code is GEU and the comparison value is 0, the + loop will not terminate. + + If the absolute increment is not 1, the loop can be infinite + even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2) + + ??? We could compute these conditions at run-time and have a + additional jump around the loop to ensure an infinite loop. + However, it is very unlikely that this is the intended + behavior of the loop and checking for these rare boundary + conditions would pessimize all other code. + + If the loop is executed only a few times an extra check to + restart the loop could use up most of the benefits of using a + count register loop. Note however, that normally, this + restart branch would never execute, so it could be predicted + well by the CPU. We should generate the pessimistic code by + default, and have an option, e.g. -funsafe-loops that would + enable count-register loops in this case. 
*/ + if (dump_file) + fprintf (dump_file, "Doloop: Possible infinite iteration case.\n"); + result = false; + goto cleanup; + } + + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + for (insn = BB_HEAD (bb); + insn != NEXT_INSN (BB_END (bb)); + insn = NEXT_INSN (insn)) + { + /* Different targets have different necessities for low-overhead + looping. Call the back end for each instruction within the loop + to let it decide whether the insn prohibits a low-overhead loop. + It will then return the cause for it to emit to the dump file. */ + const char * invalid = targetm.invalid_within_doloop (insn); + if (invalid) + { + if (dump_file) + fprintf (dump_file, "Doloop: %s\n", invalid); + result = false; + goto cleanup; + } + } + } + result = true; + +cleanup: + free (body); + + return result; +} + +/* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru + edge. If the condition is always false, do not do anything. If it is always + true, redirect E to DEST and return false. In all other cases, true is + returned. */ + +static bool +add_test (rtx cond, edge *e, basic_block dest) +{ + rtx seq, jump, label; + enum machine_mode mode; + rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1); + enum rtx_code code = GET_CODE (cond); + basic_block bb; + + mode = GET_MODE (XEXP (cond, 0)); + if (mode == VOIDmode) + mode = GET_MODE (XEXP (cond, 1)); + + start_sequence (); + op0 = force_operand (op0, NULL_RTX); + op1 = force_operand (op1, NULL_RTX); + label = block_label (dest); + do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, + NULL_RTX, label, -1); + + jump = get_last_insn (); + if (!jump || !JUMP_P (jump)) + { + /* The condition is always false and the jump was optimized out. */ + end_sequence (); + return true; + } + + seq = get_insns (); + end_sequence (); + + /* There always is at least the jump insn in the sequence. 
*/ + gcc_assert (seq != NULL_RTX); + + bb = split_edge_and_insert (*e, seq); + *e = single_succ_edge (bb); + + if (any_uncondjump_p (jump)) + { + /* The condition is always true. */ + delete_insn (jump); + redirect_edge_and_branch_force (*e, dest); + return false; + } + + JUMP_LABEL (jump) = label; + + /* The jump is supposed to handle an unlikely special case. */ + add_reg_note (jump, REG_BR_PROB, const0_rtx); + + LABEL_NUSES (label)++; + + make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU); + return true; +} + +/* Modify the loop to use the low-overhead looping insn where LOOP + describes the loop, DESC describes the number of iterations of the + loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the + end of the loop. CONDITION is the condition separated from the + DOLOOP_SEQ. COUNT is the number of iterations of the LOOP. */ + +static void +doloop_modify (struct loop *loop, struct niter_desc *desc, + rtx doloop_seq, rtx condition, rtx count) +{ + rtx counter_reg; + rtx tmp, noloop = NULL_RTX; + rtx sequence; + rtx jump_insn; + rtx jump_label; + int nonneg = 0; + bool increment_count; + basic_block loop_end = desc->out_edge->src; + enum machine_mode mode; + rtx true_prob_val; + + jump_insn = BB_END (loop_end); + + if (dump_file) + { + fprintf (dump_file, "Doloop: Inserting doloop pattern ("); + if (desc->const_iter) + fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter); + else + fputs ("runtime", dump_file); + fputs (" iterations).\n", dump_file); + } + + /* Get the probability of the original branch. If it exists we would + need to update REG_BR_PROB of the new jump_insn. */ + true_prob_val = find_reg_note (jump_insn, REG_BR_PROB, NULL_RTX); + + /* Discard original jump to continue loop. The original compare + result may still be live, so it cannot be discarded explicitly. 
*/ + delete_insn (jump_insn); + + counter_reg = XEXP (condition, 0); + if (GET_CODE (counter_reg) == PLUS) + counter_reg = XEXP (counter_reg, 0); + mode = GET_MODE (counter_reg); + + increment_count = false; + switch (GET_CODE (condition)) + { + case NE: + /* Currently only NE tests against zero and one are supported. */ + noloop = XEXP (condition, 1); + if (noloop != const0_rtx) + { + gcc_assert (noloop == const1_rtx); + increment_count = true; + } + break; + + case GE: + /* Currently only GE tests against zero are supported. */ + gcc_assert (XEXP (condition, 1) == const0_rtx); + + noloop = constm1_rtx; + + /* The iteration count does not need incrementing for a GE test. */ + increment_count = false; + + /* Determine if the iteration counter will be non-negative. + Note that the maximum value loaded is iterations_max - 1. */ + if (desc->niter_max + <= ((unsigned HOST_WIDEST_INT) 1 + << (GET_MODE_BITSIZE (mode) - 1))) + nonneg = 1; + break; + + /* Abort if an invalid doloop pattern has been generated. */ + default: + gcc_unreachable (); + } + + if (increment_count) + count = simplify_gen_binary (PLUS, mode, count, const1_rtx); + + /* Insert initialization of the count register into the loop header. */ + start_sequence (); + tmp = force_operand (count, counter_reg); + convert_move (counter_reg, tmp, 1); + sequence = get_insns (); + end_sequence (); + emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src)); + + if (desc->noloop_assumptions) + { + rtx ass = copy_rtx (desc->noloop_assumptions); + basic_block preheader = loop_preheader_edge (loop)->src; + basic_block set_zero + = split_edge (loop_preheader_edge (loop)); + basic_block new_preheader + = split_edge (loop_preheader_edge (loop)); + edge te; + + /* Expand the condition testing the assumptions and if it does not pass, + reset the count register to 0. 
*/ + redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader); + set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader); + + set_zero->count = 0; + set_zero->frequency = 0; + + te = single_succ_edge (preheader); + for (; ass; ass = XEXP (ass, 1)) + if (!add_test (XEXP (ass, 0), &te, set_zero)) + break; + + if (ass) + { + /* We reached a condition that is always true. This is very hard to + reproduce (such a loop does not roll, and thus it would most + likely get optimized out by some of the preceding optimizations). + In fact, I do not have any testcase for it. However, it would + also be very hard to show that it is impossible, so we must + handle this case. */ + set_zero->count = preheader->count; + set_zero->frequency = preheader->frequency; + } + + if (EDGE_COUNT (set_zero->preds) == 0) + { + /* All the conditions were simplified to false, remove the + unreachable set_zero block. */ + delete_basic_block (set_zero); + } + else + { + /* Reset the counter to zero in the set_zero block. */ + start_sequence (); + convert_move (counter_reg, noloop, 0); + sequence = get_insns (); + end_sequence (); + emit_insn_after (sequence, BB_END (set_zero)); + + set_immediate_dominator (CDI_DOMINATORS, set_zero, + recompute_dominator (CDI_DOMINATORS, + set_zero)); + } + + set_immediate_dominator (CDI_DOMINATORS, new_preheader, + recompute_dominator (CDI_DOMINATORS, + new_preheader)); + } + + /* Some targets (eg, C4x) need to initialize special looping + registers. */ +#ifdef HAVE_doloop_begin + { + rtx init; + unsigned level = get_loop_level (loop) + 1; + init = gen_doloop_begin (counter_reg, + desc->const_iter ? desc->niter_expr : const0_rtx, + GEN_INT (desc->niter_max), + GEN_INT (level)); + if (init) + { + start_sequence (); + emit_insn (init); + sequence = get_insns (); + end_sequence (); + emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src)); + } + } +#endif + + /* Insert the new low-overhead looping insn. 
*/ + emit_jump_insn_after (doloop_seq, BB_END (loop_end)); + jump_insn = BB_END (loop_end); + jump_label = block_label (desc->in_edge->dest); + JUMP_LABEL (jump_insn) = jump_label; + LABEL_NUSES (jump_label)++; + + /* Ensure the right fallthru edge is marked, for case we have reversed + the condition. */ + desc->in_edge->flags &= ~EDGE_FALLTHRU; + desc->out_edge->flags |= EDGE_FALLTHRU; + + /* Add a REG_NONNEG note if the actual or estimated maximum number + of iterations is non-negative. */ + if (nonneg) + add_reg_note (jump_insn, REG_NONNEG, NULL_RTX); + + /* Update the REG_BR_PROB note. */ + if (true_prob_val) + { + /* Seems safer to use the branch probability. */ + add_reg_note (jump_insn, REG_BR_PROB, + GEN_INT (desc->in_edge->probability)); + } +} + +/* Process loop described by LOOP validating that the loop is suitable for + conversion to use a low overhead looping instruction, replacing the jump + insn where suitable. Returns true if the loop was successfully + modified. */ + +static bool +doloop_optimize (struct loop *loop) +{ + enum machine_mode mode; + rtx doloop_seq, doloop_pat, doloop_reg; + rtx iterations, count; + rtx iterations_max; + rtx start_label; + rtx condition; + unsigned level, est_niter; + int max_cost; + struct niter_desc *desc; + unsigned word_mode_size; + unsigned HOST_WIDE_INT word_mode_max; + + if (dump_file) + fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num); + + iv_analysis_loop_init (loop); + + /* Find the simple exit of a LOOP. */ + desc = get_simple_loop_desc (loop); + + /* Check that loop is a candidate for a low-overhead looping insn. */ + if (!doloop_valid_p (loop, desc)) + { + if (dump_file) + fprintf (dump_file, + "Doloop: The loop is not suitable.\n"); + return false; + } + mode = desc->mode; + + est_niter = 3; + if (desc->const_iter) + est_niter = desc->niter; + /* If the estimate on number of iterations is reliable (comes from profile + feedback), use it. 
Do not use it normally, since the expected number + of iterations of an unrolled loop is 2. */ + if (loop->header->count) + est_niter = expected_loop_iterations (loop); + + if (est_niter < 3) + { + if (dump_file) + fprintf (dump_file, + "Doloop: Too few iterations (%u) to be profitable.\n", + est_niter); + return false; + } + + max_cost + = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST)); + if (rtx_cost (desc->niter_expr, SET, optimize_loop_for_speed_p (loop)) + > max_cost) + { + if (dump_file) + fprintf (dump_file, + "Doloop: number of iterations too costly to compute.\n"); + return false; + } + + count = copy_rtx (desc->niter_expr); + iterations = desc->const_iter ? desc->niter_expr : const0_rtx; + iterations_max = GEN_INT (desc->niter_max); + level = get_loop_level (loop) + 1; + + /* Generate looping insn. If the pattern FAILs then give up trying + to modify the loop since there is some aspect the back-end does + not like. */ + start_label = block_label (desc->in_edge->dest); + doloop_reg = gen_reg_rtx (mode); + doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max, + GEN_INT (level), start_label); + + word_mode_size = GET_MODE_BITSIZE (word_mode); + word_mode_max + = ((unsigned HOST_WIDE_INT) 1 << (word_mode_size - 1) << 1) - 1; + if (! doloop_seq + && mode != word_mode + /* Before trying mode different from the one in that # of iterations is + computed, we must be sure that the number of iterations fits into + the new mode. 
*/ + && (word_mode_size >= GET_MODE_BITSIZE (mode) + || desc->niter_max <= word_mode_max)) + { + if (word_mode_size > GET_MODE_BITSIZE (mode)) + { + count = simplify_gen_unary (ZERO_EXTEND, word_mode, + count, mode); + iterations = simplify_gen_unary (ZERO_EXTEND, word_mode, + iterations, mode); + iterations_max = simplify_gen_unary (ZERO_EXTEND, word_mode, + iterations_max, mode); + } + else + { + count = lowpart_subreg (word_mode, count, mode); + iterations = lowpart_subreg (word_mode, iterations, mode); + iterations_max = lowpart_subreg (word_mode, iterations_max, mode); + } + PUT_MODE (doloop_reg, word_mode); + doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max, + GEN_INT (level), start_label); + } + if (! doloop_seq) + { + if (dump_file) + fprintf (dump_file, + "Doloop: Target unwilling to use doloop pattern!\n"); + return false; + } + + /* If multiple instructions were created, the last must be the + jump instruction. Also, a raw define_insn may yield a plain + pattern. */ + doloop_pat = doloop_seq; + if (INSN_P (doloop_pat)) + { + while (NEXT_INSN (doloop_pat) != NULL_RTX) + doloop_pat = NEXT_INSN (doloop_pat); + if (!JUMP_P (doloop_pat)) + doloop_pat = NULL_RTX; + } + + if (! doloop_pat + || ! (condition = doloop_condition_get (doloop_pat))) + { + if (dump_file) + fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n"); + return false; + } + + doloop_modify (loop, desc, doloop_seq, condition, count); + return true; +} + +/* This is the main entry point. Process all loops using doloop_optimize. */ + +void +doloop_optimize_loops (void) +{ + loop_iterator li; + struct loop *loop; + + FOR_EACH_LOOP (li, loop, 0) + { + doloop_optimize (loop); + } + + iv_analysis_done (); + +#ifdef ENABLE_CHECKING + verify_dominators (CDI_DOMINATORS); + verify_loop_structure (); +#endif +} +#endif /* HAVE_doloop_end */ + |