1 files changed, 695 insertions, 0 deletions
diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
new file mode 100644
index 000000000..dcc20b191
--- /dev/null
+++ b/gcc/loop-doloop.c
@@ -0,0 +1,695 @@
+/* Perform doloop optimizations
+   Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010
+   Free Software Foundation, Inc.
+   Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "flags.h"
+#include "expr.h"
+#include "hard-reg-set.h"
+#include "basic-block.h"
+#include "diagnostic-core.h"
+#include "tm_p.h"
+#include "cfgloop.h"
+#include "output.h"
+#include "params.h"
+#include "target.h"
+
+/* This module is used to modify loops with a determinable number of
+   iterations to use special low-overhead looping instructions.
+
+   It first validates whether the loop is well behaved and has a
+   determinable number of iterations (either at compile or run-time).
+   It then modifies the loop to use a low-overhead looping pattern as
+   follows:
+
+   1. A pseudo register is allocated as the loop iteration counter.
+
+   2. The number of loop iterations is calculated and is stored
+      in the loop counter.
+
+   3. At the end of the loop, the jump insn is replaced by the
+      doloop_end pattern.  The compare must remain because it might be
+      used elsewhere.  If the loop-variable or condition register are
+      used elsewhere, they will be eliminated by flow.
+
+   4. An optional doloop_begin pattern is inserted at the top of the
+      loop.
+
+   TODO The optimization should only performed when either the biv used for exit
+   condition is unused at all except for the exit test, or if we do not have to
+   change its value, since otherwise we have to add a new induction variable,
+   which usually will not pay up (unless the cost of the doloop pattern is
+   somehow extremely lower than the cost of compare & jump, or unless the bct
+   register cannot be used for anything else but doloop -- ??? detect these
+   cases).  */
+
+#ifdef HAVE_doloop_end
+
+/* Return the loop termination condition for PATTERN or zero
+   if it is not a decrement and branch jump insn.  */
+
+rtx
+doloop_condition_get (rtx doloop_pat)
+{
+  rtx cmp;
+  rtx inc;
+  rtx reg;
+  rtx inc_src;
+  rtx condition;
+  rtx pattern;
+
+  /* The canonical doloop pattern we expect has one of the following
+     forms:
+
+     1)  (parallel [(set (pc) (if_then_else (condition)
+	  			            (label_ref (label))
+				            (pc)))
+	             (set (reg) (plus (reg) (const_int -1)))
+	             (additional clobbers and uses)])
+
+     The branch must be the first entry of the parallel (also required
+     by jump.c), and the second entry of the parallel must be a set of
+     the loop counter register.  Some targets (IA-64) wrap the set of
+     the loop counter in an if_then_else too.
+
+     2)  (set (reg) (plus (reg) (const_int -1))
+         (set (pc) (if_then_else (reg != 0)
+	                         (label_ref (label))
+			         (pc))).  */
+
+  pattern = PATTERN (doloop_pat);
+
+  if (GET_CODE (pattern) != PARALLEL)
+    {
+      rtx cond;
+      rtx prev_insn = prev_nondebug_insn (doloop_pat);
+
+      /* We expect the decrement to immediately precede the branch.  */
+
+      if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
+        return 0;
+
+      cmp = pattern;
+      inc = PATTERN (PREV_INSN (doloop_pat));
+      /* We expect the condition to be of the form (reg != 0)  */
+      cond = XEXP (SET_SRC (cmp), 0);
+      if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+        return 0;
+
+    }
+  else
+    {
+      cmp = XVECEXP (pattern, 0, 0);
+      inc = XVECEXP (pattern, 0, 1);
+    }
+
+  /* Check for (set (reg) (something)).  */
+  if (GET_CODE (inc) != SET)
+    return 0;
+  reg = SET_DEST (inc);
+  if (! REG_P (reg))
+    return 0;
+
+  /* Check if something = (plus (reg) (const_int -1)).
+     On IA-64, this decrement is wrapped in an if_then_else.  */
+  inc_src = SET_SRC (inc);
+  if (GET_CODE (inc_src) == IF_THEN_ELSE)
+    inc_src = XEXP (inc_src, 1);
+  if (GET_CODE (inc_src) != PLUS
+      || XEXP (inc_src, 0) != reg
+      || XEXP (inc_src, 1) != constm1_rtx)
+    return 0;
+
+  /* Check for (set (pc) (if_then_else (condition)
+                                       (label_ref (label))
+                                       (pc))).  */
+  if (GET_CODE (cmp) != SET
+      || SET_DEST (cmp) != pc_rtx
+      || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
+      || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
+      || XEXP (SET_SRC (cmp), 2) != pc_rtx)
+    return 0;
+
+  /* Extract loop termination condition.  */
+  condition = XEXP (SET_SRC (cmp), 0);
+
+  /* We expect a GE or NE comparison with 0 or 1.  */
+  if ((GET_CODE (condition) != GE
+       && GET_CODE (condition) != NE)
+      || (XEXP (condition, 1) != const0_rtx
+          && XEXP (condition, 1) != const1_rtx))
+    return 0;
+
+  if ((XEXP (condition, 0) == reg)
+      || (GET_CODE (XEXP (condition, 0)) == PLUS
+		   && XEXP (XEXP (condition, 0), 0) == reg))
+   {
+     if (GET_CODE (pattern) != PARALLEL)
+     /*  The second form we expect:
+
+         (set (reg) (plus (reg) (const_int -1))
+         (set (pc) (if_then_else (reg != 0)
+                                 (label_ref (label))
+                                 (pc))).
+
+         is equivalent to the following:
+
+         (parallel [(set (pc) (if_then_else (reg != 1)
+                                            (label_ref (label))
+                                            (pc)))
+                     (set (reg) (plus (reg) (const_int -1)))
+                     (additional clobbers and uses)])
+
+         So we return that form instead.
+     */
+        condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+
+    return condition;
+   }
+
+  /* ??? If a machine uses a funny comparison, we could return a
+     canonicalized form here.  */
+
+  return 0;
+}
+
+/* Return nonzero if the loop specified by LOOP is suitable for
+   the use of special low-overhead looping instructions.  DESC
+   describes the number of iterations of the loop.  */
+
+static bool
+doloop_valid_p (struct loop *loop, struct niter_desc *desc)
+{
+  basic_block *body = get_loop_body (loop), bb;
+  rtx insn;
+  unsigned i;
+  bool result = true;
+
+  /* Check for loops that may not terminate under special conditions.  */
+  if (!desc->simple_p
+      || desc->assumptions
+      || desc->infinite)
+    {
+      /* There are some cases that would require a special attention.
+	 For example if the comparison is LEU and the comparison value
+	 is UINT_MAX then the loop will not terminate.  Similarly, if the
+	 comparison code is GEU and the comparison value is 0, the
+	 loop will not terminate.
+
+	 If the absolute increment is not 1, the loop can be infinite
+	 even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
+
+	 ??? We could compute these conditions at run-time and have a
+	 additional jump around the loop to ensure an infinite loop.
+	 However, it is very unlikely that this is the intended
+	 behavior of the loop and checking for these rare boundary
+	 conditions would pessimize all other code.
+
+	 If the loop is executed only a few times an extra check to
+	 restart the loop could use up most of the benefits of using a
+	 count register loop.  Note however, that normally, this
+	 restart branch would never execute, so it could be predicted
+	 well by the CPU.  We should generate the pessimistic code by
+	 default, and have an option, e.g. -funsafe-loops that would
+	 enable count-register loops in this case.  */
+      if (dump_file)
+	fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
+      result = false;
+      goto cleanup;
+    }
+
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      for (insn = BB_HEAD (bb);
+	   insn != NEXT_INSN (BB_END (bb));
+	   insn = NEXT_INSN (insn))
+	{
+	  /* Different targets have different necessities for low-overhead
+	     looping.  Call the back end for each instruction within the loop
+	     to let it decide whether the insn prohibits a low-overhead loop.
+	     It will then return the cause for it to emit to the dump file.  */
+	  const char * invalid = targetm.invalid_within_doloop (insn);
+	  if (invalid)
+	    {
+	      if (dump_file)
+		fprintf (dump_file, "Doloop: %s\n", invalid);
+	      result = false;
+	      goto cleanup;
+	    }
+	}
+    }
+  result = true;
+
+cleanup:
+  free (body);
+
+  return result;
+}
+
+/* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
+   edge.  If the condition is always false, do not do anything.  If it is always
+   true, redirect E to DEST and return false.  In all other cases, true is
+   returned.  */
+
+static bool
+add_test (rtx cond, edge *e, basic_block dest)
+{
+  rtx seq, jump, label;
+  enum machine_mode mode;
+  rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
+  enum rtx_code code = GET_CODE (cond);
+  basic_block bb;
+
+  mode = GET_MODE (XEXP (cond, 0));
+  if (mode == VOIDmode)
+    mode = GET_MODE (XEXP (cond, 1));
+
+  start_sequence ();
+  op0 = force_operand (op0, NULL_RTX);
+  op1 = force_operand (op1, NULL_RTX);
+  label = block_label (dest);
+  do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX,
+			   NULL_RTX, label, -1);
+
+  jump = get_last_insn ();
+  if (!jump || !JUMP_P (jump))
+    {
+      /* The condition is always false and the jump was optimized out.  */
+      end_sequence ();
+      return true;
+    }
+
+  seq = get_insns ();
+  end_sequence ();
+
+  /* There always is at least the jump insn in the sequence.  */
+  gcc_assert (seq != NULL_RTX);
+
+  bb = split_edge_and_insert (*e, seq);
+  *e = single_succ_edge (bb);
+
+  if (any_uncondjump_p (jump))
+    {
+      /* The condition is always true.  */
+      delete_insn (jump);
+      redirect_edge_and_branch_force (*e, dest);
+      return false;
+    }
+
+  JUMP_LABEL (jump) = label;
+
+  /* The jump is supposed to handle an unlikely special case.  */
+  add_reg_note (jump, REG_BR_PROB, const0_rtx);
+
+  LABEL_NUSES (label)++;
+
+  make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
+  return true;
+}
+
+/* Modify the loop to use the low-overhead looping insn where LOOP
+   describes the loop, DESC describes the number of iterations of the
+   loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
+   end of the loop.  CONDITION is the condition separated from the
+   DOLOOP_SEQ.  COUNT is the number of iterations of the LOOP.  */
+
+static void
+doloop_modify (struct loop *loop, struct niter_desc *desc,
+	       rtx doloop_seq, rtx condition, rtx count)
+{
+  rtx counter_reg;
+  rtx tmp, noloop = NULL_RTX;
+  rtx sequence;
+  rtx jump_insn;
+  rtx jump_label;
+  int nonneg = 0;
+  bool increment_count;
+  basic_block loop_end = desc->out_edge->src;
+  enum machine_mode mode;
+  rtx true_prob_val;
+
+  jump_insn = BB_END (loop_end);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Doloop: Inserting doloop pattern (");
+      if (desc->const_iter)
+	fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
+      else
+	fputs ("runtime", dump_file);
+      fputs (" iterations).\n", dump_file);
+    }
+
+  /* Get the probability of the original branch. If it exists we would
+     need to update REG_BR_PROB of the new jump_insn.  */
+  true_prob_val = find_reg_note (jump_insn, REG_BR_PROB, NULL_RTX);
+
+  /* Discard original jump to continue loop.  The original compare
+     result may still be live, so it cannot be discarded explicitly.  */
+  delete_insn (jump_insn);
+
+  counter_reg = XEXP (condition, 0);
+  if (GET_CODE (counter_reg) == PLUS)
+    counter_reg = XEXP (counter_reg, 0);
+  mode = GET_MODE (counter_reg);
+
+  increment_count = false;
+  switch (GET_CODE (condition))
+    {
+    case NE:
+      /* Currently only NE tests against zero and one are supported.  */
+      noloop = XEXP (condition, 1);
+      if (noloop != const0_rtx)
+	{
+	  gcc_assert (noloop == const1_rtx);
+	  increment_count = true;
+	}
+      break;
+
+    case GE:
+      /* Currently only GE tests against zero are supported.  */
+      gcc_assert (XEXP (condition, 1) == const0_rtx);
+
+      noloop = constm1_rtx;
+
+      /* The iteration count does not need incrementing for a GE test.  */
+      increment_count = false;
+
+      /* Determine if the iteration counter will be non-negative.
+	 Note that the maximum value loaded is iterations_max - 1.  */
+      if (desc->niter_max
+	  <= ((unsigned HOST_WIDEST_INT) 1
+	      << (GET_MODE_BITSIZE (mode) - 1)))
+	nonneg = 1;
+      break;
+
+      /* Abort if an invalid doloop pattern has been generated.  */
+    default:
+      gcc_unreachable ();
+    }
+
+  if (increment_count)
+    count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+
+  /* Insert initialization of the count register into the loop header.  */
+  start_sequence ();
+  tmp = force_operand (count, counter_reg);
+  convert_move (counter_reg, tmp, 1);
+  sequence = get_insns ();
+  end_sequence ();
+  emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
+
+  if (desc->noloop_assumptions)
+    {
+      rtx ass = copy_rtx (desc->noloop_assumptions);
+      basic_block preheader = loop_preheader_edge (loop)->src;
+      basic_block set_zero
+	      = split_edge (loop_preheader_edge (loop));
+      basic_block new_preheader
+	      = split_edge (loop_preheader_edge (loop));
+      edge te;
+
+      /* Expand the condition testing the assumptions and if it does not pass,
+	 reset the count register to 0.  */
+      redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
+      set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
+
+      set_zero->count = 0;
+      set_zero->frequency = 0;
+
+      te = single_succ_edge (preheader);
+      for (; ass; ass = XEXP (ass, 1))
+	if (!add_test (XEXP (ass, 0), &te, set_zero))
+	  break;
+
+      if (ass)
+	{
+	  /* We reached a condition that is always true.  This is very hard to
+	     reproduce (such a loop does not roll, and thus it would most
+	     likely get optimized out by some of the preceding optimizations).
+	     In fact, I do not have any testcase for it.  However, it would
+	     also be very hard to show that it is impossible, so we must
+	     handle this case.  */
+	  set_zero->count = preheader->count;
+	  set_zero->frequency = preheader->frequency;
+	}
+
+      if (EDGE_COUNT (set_zero->preds) == 0)
+	{
+	  /* All the conditions were simplified to false, remove the
+	     unreachable set_zero block.  */
+	  delete_basic_block (set_zero);
+	}
+      else
+	{
+	  /* Reset the counter to zero in the set_zero block.  */
+	  start_sequence ();
+	  convert_move (counter_reg, noloop, 0);
+	  sequence = get_insns ();
+	  end_sequence ();
+	  emit_insn_after (sequence, BB_END (set_zero));
+
+	  set_immediate_dominator (CDI_DOMINATORS, set_zero,
+				   recompute_dominator (CDI_DOMINATORS,
+							set_zero));
+	}
+
+      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
+			       recompute_dominator (CDI_DOMINATORS,
+						    new_preheader));
+    }
+
+  /* Some targets (eg, C4x) need to initialize special looping
+     registers.  */
+#ifdef HAVE_doloop_begin
+  {
+    rtx init;
+    unsigned level = get_loop_level (loop) + 1;
+    init = gen_doloop_begin (counter_reg,
+			     desc->const_iter ? desc->niter_expr : const0_rtx,
+			     GEN_INT (desc->niter_max),
+			     GEN_INT (level));
+    if (init)
+      {
+	start_sequence ();
+	emit_insn (init);
+	sequence = get_insns ();
+	end_sequence ();
+	emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
+      }
+  }
+#endif
+
+  /* Insert the new low-overhead looping insn.  */
+  emit_jump_insn_after (doloop_seq, BB_END (loop_end));
+  jump_insn = BB_END (loop_end);
+  jump_label = block_label (desc->in_edge->dest);
+  JUMP_LABEL (jump_insn) = jump_label;
+  LABEL_NUSES (jump_label)++;
+
+  /* Ensure the right fallthru edge is marked, for case we have reversed
+     the condition.  */
+  desc->in_edge->flags &= ~EDGE_FALLTHRU;
+  desc->out_edge->flags |= EDGE_FALLTHRU;
+
+  /* Add a REG_NONNEG note if the actual or estimated maximum number
+     of iterations is non-negative.  */
+  if (nonneg)
+    add_reg_note (jump_insn, REG_NONNEG, NULL_RTX);
+
+  /* Update the REG_BR_PROB note.  */
+  if (true_prob_val)
+    {
+      /* Seems safer to use the branch probability.  */
+      add_reg_note (jump_insn, REG_BR_PROB,
+		    GEN_INT (desc->in_edge->probability));
+    }
+}
+
+/* Process loop described by LOOP validating that the loop is suitable for
+   conversion to use a low overhead looping instruction, replacing the jump
+   insn where suitable.  Returns true if the loop was successfully
+   modified.  */
+
+static bool
+doloop_optimize (struct loop *loop)
+{
+  enum machine_mode mode;
+  rtx doloop_seq, doloop_pat, doloop_reg;
+  rtx iterations, count;
+  rtx iterations_max;
+  rtx start_label;
+  rtx condition;
+  unsigned level, est_niter;
+  int max_cost;
+  struct niter_desc *desc;
+  unsigned word_mode_size;
+  unsigned HOST_WIDE_INT word_mode_max;
+
+  if (dump_file)
+    fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
+
+  iv_analysis_loop_init (loop);
+
+  /* Find the simple exit of a LOOP.  */
+  desc = get_simple_loop_desc (loop);
+
+  /* Check that loop is a candidate for a low-overhead looping insn.  */
+  if (!doloop_valid_p (loop, desc))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: The loop is not suitable.\n");
+      return false;
+    }
+  mode = desc->mode;
+
+  est_niter = 3;
+  if (desc->const_iter)
+    est_niter = desc->niter;
+  /* If the estimate on number of iterations is reliable (comes from profile
+     feedback), use it.  Do not use it normally, since the expected number
+     of iterations of an unrolled loop is 2.  */
+  if (loop->header->count)
+    est_niter = expected_loop_iterations (loop);
+
+  if (est_niter < 3)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: Too few iterations (%u) to be profitable.\n",
+		 est_niter);
+      return false;
+    }
+
+  max_cost
+    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
+  if (rtx_cost (desc->niter_expr, SET, optimize_loop_for_speed_p (loop))
+      > max_cost)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: number of iterations too costly to compute.\n");
+      return false;
+    }
+
+  count = copy_rtx (desc->niter_expr);
+  iterations = desc->const_iter ? desc->niter_expr : const0_rtx;
+  iterations_max = GEN_INT (desc->niter_max);
+  level = get_loop_level (loop) + 1;
+
+  /* Generate looping insn.  If the pattern FAILs then give up trying
+     to modify the loop since there is some aspect the back-end does
+     not like.  */
+  start_label = block_label (desc->in_edge->dest);
+  doloop_reg = gen_reg_rtx (mode);
+  doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max,
+			       GEN_INT (level), start_label);
+
+  word_mode_size = GET_MODE_BITSIZE (word_mode);
+  word_mode_max
+	  = ((unsigned HOST_WIDE_INT) 1 << (word_mode_size - 1) << 1) - 1;
+  if (! doloop_seq
+      && mode != word_mode
+      /* Before trying mode different from the one in that # of iterations is
+	 computed, we must be sure that the number of iterations fits into
+	 the new mode.  */
+      && (word_mode_size >= GET_MODE_BITSIZE (mode)
+	  || desc->niter_max <= word_mode_max))
+    {
+      if (word_mode_size > GET_MODE_BITSIZE (mode))
+	{
+	  count = simplify_gen_unary (ZERO_EXTEND, word_mode,
+				      count, mode);
+	  iterations = simplify_gen_unary (ZERO_EXTEND, word_mode,
+					   iterations, mode);
+	  iterations_max = simplify_gen_unary (ZERO_EXTEND, word_mode,
+					       iterations_max, mode);
+	}
+      else
+	{
+	  count = lowpart_subreg (word_mode, count, mode);
+	  iterations = lowpart_subreg (word_mode, iterations, mode);
+	  iterations_max = lowpart_subreg (word_mode, iterations_max, mode);
+	}
+      PUT_MODE (doloop_reg, word_mode);
+      doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max,
+				   GEN_INT (level), start_label);
+    }
+  if (! doloop_seq)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "Doloop: Target unwilling to use doloop pattern!\n");
+      return false;
+    }
+
+  /* If multiple instructions were created, the last must be the
+     jump instruction.  Also, a raw define_insn may yield a plain
+     pattern.  */
+  doloop_pat = doloop_seq;
+  if (INSN_P (doloop_pat))
+    {
+      while (NEXT_INSN (doloop_pat) != NULL_RTX)
+	doloop_pat = NEXT_INSN (doloop_pat);
+      if (!JUMP_P (doloop_pat))
+	doloop_pat = NULL_RTX;
+    }
+
+  if (! doloop_pat
+      || ! (condition = doloop_condition_get (doloop_pat)))
+    {
+      if (dump_file)
+	fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
+      return false;
+    }
+
+  doloop_modify (loop, desc, doloop_seq, condition, count);
+  return true;
+}
+
+/* This is the main entry point.  Process all loops using doloop_optimize.  */
+
+void
+doloop_optimize_loops (void)
+{
+  loop_iterator li;
+  struct loop *loop;
+
+  FOR_EACH_LOOP (li, loop, 0)
+    {
+      doloop_optimize (loop);
+    }
+
+  iv_analysis_done ();
+
+#ifdef ENABLE_CHECKING
+  verify_dominators (CDI_DOMINATORS);
+  verify_loop_structure ();
+#endif
+}
+#endif /* HAVE_doloop_end */
+