Diffstat (limited to 'gcc/tree-vect-slp.c')
-rw-r--r-- | gcc/tree-vect-slp.c | 2629
1 file changed, 2629 insertions, 0 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c new file mode 100644 index 000000000..6eb67ae5a --- /dev/null +++ b/gcc/tree-vect-slp.c @@ -0,0 +1,2629 @@ +/* SLP - Basic Block Vectorization + Copyright (C) 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Dorit Naishlos <dorit@il.ibm.com> + and Ira Rosen <irar@il.ibm.com> + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "ggc.h" +#include "tree.h" +#include "target.h" +#include "basic-block.h" +#include "tree-pretty-print.h" +#include "gimple-pretty-print.h" +#include "tree-flow.h" +#include "tree-dump.h" +#include "cfgloop.h" +#include "cfglayout.h" +#include "expr.h" +#include "recog.h" +#include "optabs.h" +#include "tree-vectorizer.h" + +/* Extract the location of the basic block in the source code. + Return the basic block location if succeed and NULL if not. */ + +LOC +find_bb_location (basic_block bb) +{ + gimple stmt = NULL; + gimple_stmt_iterator si; + + if (!bb) + return UNKNOWN_LOC; + + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + stmt = gsi_stmt (si); + if (gimple_location (stmt) != UNKNOWN_LOC) + return gimple_location (stmt); + } + + return UNKNOWN_LOC; +} + + +/* Recursively free the memory allocated for the SLP tree rooted at NODE. */ + +static void +vect_free_slp_tree (slp_tree node) +{ + if (!node) + return; + + if (SLP_TREE_LEFT (node)) + vect_free_slp_tree (SLP_TREE_LEFT (node)); + + if (SLP_TREE_RIGHT (node)) + vect_free_slp_tree (SLP_TREE_RIGHT (node)); + + VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node)); + + if (SLP_TREE_VEC_STMTS (node)) + VEC_free (gimple, heap, SLP_TREE_VEC_STMTS (node)); + + free (node); +} + + +/* Free the memory allocated for the SLP instance. */ + +void +vect_free_slp_instance (slp_instance instance) +{ + vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance)); + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); +} + + +/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that + they are of a legal type and that they match the defs of the first stmt of + the SLP group (stored in FIRST_STMT_...). 
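+   For example, the group { a0 = b0 + c0; a1 = b1 + c1 } is accepted when the
+   defs of b1/c1 have the same def type and a compatible type as those of
+   b0/c0, while { a0 = b0 + c0; a1 = b1 + 5 } is rejected because an internal
+   def would be matched against a constant.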
*/ + +static bool +vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, + slp_tree slp_node, gimple stmt, + VEC (gimple, heap) **def_stmts0, + VEC (gimple, heap) **def_stmts1, + enum vect_def_type *first_stmt_dt0, + enum vect_def_type *first_stmt_dt1, + tree *first_stmt_def0_type, + tree *first_stmt_def1_type, + tree *first_stmt_const_oprnd, + int ncopies_for_cost, + bool *pattern0, bool *pattern1) +{ + tree oprnd; + unsigned int i, number_of_oprnds; + tree def; + gimple def_stmt; + enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; + stmt_vec_info stmt_info = + vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0)); + enum gimple_rhs_class rhs_class; + struct loop *loop = NULL; + + if (loop_vinfo) + loop = LOOP_VINFO_LOOP (loop_vinfo); + + rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt)); + number_of_oprnds = gimple_num_ops (stmt) - 1; /* RHS only */ + + for (i = 0; i < number_of_oprnds; i++) + { + oprnd = gimple_op (stmt, i + 1); + + if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def, + &dt[i]) + || (!def_stmt && dt[i] != vect_constant_def)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: can't find def for "); + print_generic_expr (vect_dump, oprnd, TDF_SLIM); + } + + return false; + } + + /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt + from the pattern. Check that all the stmts of the node are in the + pattern. */ + if (loop && def_stmt && gimple_bb (def_stmt) + && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) + && vinfo_for_stmt (def_stmt) + && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))) + { + if (!*first_stmt_dt0) + *pattern0 = true; + else + { + if (i == 1 && !*first_stmt_dt1) + *pattern1 = true; + else if ((i == 0 && !*pattern0) || (i == 1 && !*pattern1)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "Build SLP failed: some of the stmts" + " are in a pattern, and others are not "); + print_generic_expr (vect_dump, oprnd, TDF_SLIM); + } + + return false; + } + } + + def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)); + dt[i] = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)); + + if (*dt == vect_unknown_def_type) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Unsupported pattern."); + return false; + } + + switch (gimple_code (def_stmt)) + { + case GIMPLE_PHI: + def = gimple_phi_result (def_stmt); + break; + + case GIMPLE_ASSIGN: + def = gimple_assign_lhs (def_stmt); + break; + + default: + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "unsupported defining stmt: "); + return false; + } + } + + if (!*first_stmt_dt0) + { + /* op0 of the first stmt of the group - store its info. */ + *first_stmt_dt0 = dt[i]; + if (def) + *first_stmt_def0_type = TREE_TYPE (def); + else + *first_stmt_const_oprnd = oprnd; + + /* Analyze costs (for the first stmt of the group only). */ + if (rhs_class != GIMPLE_SINGLE_RHS) + /* Not memory operation (we don't call this functions for loads). */ + vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node); + else + /* Store. */ + vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node); + } + + else + { + if (!*first_stmt_dt1 && i == 1) + { + /* op1 of the first stmt of the group - store its info. */ + *first_stmt_dt1 = dt[i]; + if (def) + *first_stmt_def1_type = TREE_TYPE (def); + else + { + /* We assume that the stmt contains only one constant + operand. 
We fail otherwise, to be on the safe side. */ + if (*first_stmt_const_oprnd) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: two constant " + "oprnds in stmt"); + return false; + } + *first_stmt_const_oprnd = oprnd; + } + } + else + { + /* Not first stmt of the group, check that the def-stmt/s match + the def-stmt/s of the first stmt. */ + if ((i == 0 + && (*first_stmt_dt0 != dt[i] + || (*first_stmt_def0_type && def + && !types_compatible_p (*first_stmt_def0_type, + TREE_TYPE (def))))) + || (i == 1 + && (*first_stmt_dt1 != dt[i] + || (*first_stmt_def1_type && def + && !types_compatible_p (*first_stmt_def1_type, + TREE_TYPE (def))))) + || (!def + && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd), + TREE_TYPE (oprnd)))) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: different types "); + + return false; + } + } + } + + /* Check the types of the definitions. */ + switch (dt[i]) + { + case vect_constant_def: + case vect_external_def: + break; + + case vect_internal_def: + case vect_reduction_def: + if (i == 0) + VEC_safe_push (gimple, heap, *def_stmts0, def_stmt); + else + VEC_safe_push (gimple, heap, *def_stmts1, def_stmt); + break; + + default: + /* FORNOW: Not supported. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: illegal type of def "); + print_generic_expr (vect_dump, def, TDF_SLIM); + } + + return false; + } + } + + return true; +} + + +/* Recursively build an SLP tree starting from NODE. + Fail (and return FALSE) if def-stmts are not isomorphic, require data + permutation or are of unsupported types of operation. Otherwise, return + TRUE. */ + +static bool +vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, + slp_tree *node, unsigned int group_size, + int *inside_cost, int *outside_cost, + int ncopies_for_cost, unsigned int *max_nunits, + VEC (int, heap) **load_permutation, + VEC (slp_tree, heap) **loads, + unsigned int vectorization_factor) +{ + VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size); + VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size); + unsigned int i; + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node); + gimple stmt = VEC_index (gimple, stmts, 0); + enum vect_def_type first_stmt_dt0 = vect_uninitialized_def; + enum vect_def_type first_stmt_dt1 = vect_uninitialized_def; + enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK; + tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE; + tree lhs; + bool stop_recursion = false, need_same_oprnds = false; + tree vectype, scalar_type, first_op1 = NULL_TREE; + unsigned int ncopies; + optab optab; + int icode; + enum machine_mode optab_op2_mode; + enum machine_mode vec_mode; + tree first_stmt_const_oprnd = NULL_TREE; + struct data_reference *first_dr; + bool pattern0 = false, pattern1 = false; + HOST_WIDE_INT dummy; + bool permutation = false; + unsigned int load_place; + gimple first_load, prev_first_load = NULL; + + /* For every stmt in NODE find its def stmt/s. */ + FOR_EACH_VEC_ELT (gimple, stmts, i, stmt) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + /* Fail to vectorize statements marked as unvectorizable. 
*/ + if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt))) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: unvectorizable statement "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + lhs = gimple_get_lhs (stmt); + if (lhs == NULL_TREE) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL"); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy); + vectype = get_vectype_for_scalar_type (scalar_type); + if (!vectype) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data-type "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } + return false; + } + + ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype); + if (ncopies != 1) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "SLP with multiple types "); + + /* FORNOW: multiple types are unsupported in BB SLP. */ + if (bb_vinfo) + return false; + } + + /* In case of multiple types we need to detect the smallest type. */ + if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype)) + *max_nunits = TYPE_VECTOR_SUBPARTS (vectype); + + if (is_gimple_call (stmt)) + rhs_code = CALL_EXPR; + else + rhs_code = gimple_assign_rhs_code (stmt); + + /* Check the operation. */ + if (i == 0) + { + first_stmt_code = rhs_code; + + /* Shift arguments should be equal in all the packed stmts for a + vector shift with scalar shift operand. */ + if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR + || rhs_code == LROTATE_EXPR + || rhs_code == RROTATE_EXPR) + { + vec_mode = TYPE_MODE (vectype); + + /* First see if we have a vector/vector shift. */ + optab = optab_for_tree_code (rhs_code, vectype, + optab_vector); + + if (!optab + || optab_handler (optab, vec_mode) == CODE_FOR_nothing) + { + /* No vector/vector shift, try for a vector/scalar shift. 
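+   A vector/scalar shift applies one scalar amount to every lane, so all the
+   stmts of the group must then use the same shift operand: { x0 = a0 << s;
+   x1 = a1 << s } is fine, { x0 = a0 << s0; x1 = a1 << s1 } is not (this is
+   enforced below via need_same_oprnds and first_op1).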
*/ + optab = optab_for_tree_code (rhs_code, vectype, + optab_scalar); + + if (!optab) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: no optab."); + return false; + } + icode = (int) optab_handler (optab, vec_mode); + if (icode == CODE_FOR_nothing) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: " + "op not supported by target."); + return false; + } + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + need_same_oprnds = true; + first_op1 = gimple_assign_rhs2 (stmt); + } + } + } + } + else + { + if (first_stmt_code != rhs_code + && (first_stmt_code != IMAGPART_EXPR + || rhs_code != REALPART_EXPR) + && (first_stmt_code != REALPART_EXPR + || rhs_code != IMAGPART_EXPR) + && !(STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt)) + && (first_stmt_code == ARRAY_REF + || first_stmt_code == INDIRECT_REF + || first_stmt_code == COMPONENT_REF + || first_stmt_code == MEM_REF))) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different operation in stmt "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + if (need_same_oprnds + && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different shift arguments in "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + } + + /* Strided store or load. */ + if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt))) + { + if (REFERENCE_CLASS_P (lhs)) + { + /* Store. */ + if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, + stmt, &def_stmts0, &def_stmts1, + &first_stmt_dt0, + &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost, + &pattern0, &pattern1)) + return false; + } + else + { + /* Load. */ + /* FORNOW: Check that there is no gap between the loads. */ + if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) + || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + "loads have gaps "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Check that the size of interleaved loads group is not + greater than the SLP group size. */ + if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: the number of " + "interleaved loads is greater than" + " the SLP group size "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + if (prev_first_load) + { + /* Check that there are no loads from different interleaving + chains in the same node. The only exception is complex + numbers. 
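+   (A node mixing REALPART_EXPR/IMAGPART_EXPR loads may legitimately pull
+   from two chains; such nodes are kept in LOADS and verified later in
+   vect_supported_load_permutation_p.)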
*/ + if (prev_first_load != first_load + && rhs_code != REALPART_EXPR + && rhs_code != IMAGPART_EXPR) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: different " + "interleaving chains in one node "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + } + else + prev_first_load = first_load; + + if (first_load == stmt) + { + first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + if (vect_supportable_dr_alignment (first_dr, false) + == dr_unaligned_unsupported) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported " + "unaligned load "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Analyze costs (for the first stmt in the group). */ + vect_model_load_cost (vinfo_for_stmt (stmt), + ncopies_for_cost, *node); + } + + /* Store the place of this load in the interleaving chain. In + case that permutation is needed we later decide if a specific + permutation is supported. */ + load_place = vect_get_place_in_interleaving_chain (stmt, + first_load); + if (load_place != i) + permutation = true; + + VEC_safe_push (int, heap, *load_permutation, load_place); + + /* We stop the tree when we reach a group of loads. */ + stop_recursion = true; + continue; + } + } /* Strided access. */ + else + { + if (TREE_CODE_CLASS (rhs_code) == tcc_reference) + { + /* Not strided load. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: not strided load "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + /* FORNOW: Not strided loads are not supported. */ + return false; + } + + /* Not memory operation. */ + if (TREE_CODE_CLASS (rhs_code) != tcc_binary + && TREE_CODE_CLASS (rhs_code) != tcc_unary) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: operation"); + fprintf (vect_dump, " unsupported "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Find the def-stmts. */ + if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt, + &def_stmts0, &def_stmts1, + &first_stmt_dt0, &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost, + &pattern0, &pattern1)) + return false; + } + } + + /* Add the costs of the node to the overall instance costs. */ + *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node); + *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node); + + /* Strided loads were reached - stop the recursion. */ + if (stop_recursion) + { + if (permutation) + { + VEC_safe_push (slp_tree, heap, *loads, *node); + *inside_cost + += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0) + * group_size; + } + else + { + /* We don't check here complex numbers chains, so we keep them in + LOADS for further check in vect_supported_load_permutation_p. */ + if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR) + VEC_safe_push (slp_tree, heap, *loads, *node); + } + + return true; + } + + /* Create SLP_TREE nodes for the definition node/s. 
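+   E.g., for the group { c0 = a0 + b0; c1 = a1 + b1 } the def stmts of a0/a1
+   become the left child and those of b0/b1 the right child, each child being
+   built by a recursive call.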
*/ + if (first_stmt_dt0 == vect_internal_def) + { + slp_tree left_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0; + SLP_TREE_VEC_STMTS (left_node) = NULL; + SLP_TREE_LEFT (left_node) = NULL; + SLP_TREE_RIGHT (left_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size, + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads, + vectorization_factor)) + return false; + + SLP_TREE_LEFT (*node) = left_node; + } + + if (first_stmt_dt1 == vect_internal_def) + { + slp_tree right_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1; + SLP_TREE_VEC_STMTS (right_node) = NULL; + SLP_TREE_LEFT (right_node) = NULL; + SLP_TREE_RIGHT (right_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size, + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads, + vectorization_factor)) + return false; + + SLP_TREE_RIGHT (*node) = right_node; + } + + return true; +} + + +static void +vect_print_slp_tree (slp_tree node) +{ + int i; + gimple stmt; + + if (!node) + return; + + fprintf (vect_dump, "node "); + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + { + fprintf (vect_dump, "\n\tstmt %d ", i); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + fprintf (vect_dump, "\n"); + + vect_print_slp_tree (SLP_TREE_LEFT (node)); + vect_print_slp_tree (SLP_TREE_RIGHT (node)); +} + + +/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). + If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index + J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the + stmts in NODE are to be marked. */ + +static void +vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) +{ + int i; + gimple stmt; + + if (!node) + return; + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + if (j < 0 || i == j) + STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark; + + vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j); + vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j); +} + + +/* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ + +static void +vect_mark_slp_stmts_relevant (slp_tree node) +{ + int i; + gimple stmt; + stmt_vec_info stmt_info; + + if (!node) + return; + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + { + stmt_info = vinfo_for_stmt (stmt); + gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) + || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); + STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; + } + + vect_mark_slp_stmts_relevant (SLP_TREE_LEFT (node)); + vect_mark_slp_stmts_relevant (SLP_TREE_RIGHT (node)); +} + + +/* Check if the permutation required by the SLP INSTANCE is supported. + Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. 
*/ + +static bool +vect_supported_slp_permutation_p (slp_instance instance) +{ + slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0); + gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + VEC (slp_tree, heap) *sorted_loads = NULL; + int index; + slp_tree *tmp_loads = NULL; + int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j; + slp_tree load; + + /* FORNOW: The only supported loads permutation is loads from the same + location in all the loads in the node, when the data-refs in + nodes of LOADS constitute an interleaving chain. + Sort the nodes according to the order of accesses in the chain. */ + tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size); + for (i = 0, j = 0; + VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index) + && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load); + i += group_size, j++) + { + gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0); + /* Check that the loads are all in the same interleaving chain. */ + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data " + "permutation "); + print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM); + } + + free (tmp_loads); + return false; + } + + tmp_loads[index] = load; + } + + sorted_loads = VEC_alloc (slp_tree, heap, group_size); + for (i = 0; i < group_size; i++) + VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]); + + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); + SLP_INSTANCE_LOADS (instance) = sorted_loads; + free (tmp_loads); + + if (!vect_transform_slp_perm_load (stmt, NULL, NULL, + SLP_INSTANCE_UNROLLING_FACTOR (instance), + instance, true)) + return false; + + return true; +} + + +/* Rearrange the statements of NODE according to PERMUTATION. */ + +static void +vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, + VEC (int, heap) *permutation) +{ + gimple stmt; + VEC (gimple, heap) *tmp_stmts; + unsigned int index, i; + + if (!node) + return; + + vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation); + vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation); + + gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))); + tmp_stmts = VEC_alloc (gimple, heap, group_size); + + for (i = 0; i < group_size; i++) + VEC_safe_push (gimple, heap, tmp_stmts, NULL); + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + { + index = VEC_index (int, permutation, i); + VEC_replace (gimple, tmp_stmts, index, stmt); + } + + VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node)); + SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; +} + + +/* Check if the required load permutation is supported. + LOAD_PERMUTATION contains a list of indices of the loads. + In SLP this permutation is relative to the order of strided stores that are + the base of the SLP instance. */ + +static bool +vect_supported_load_permutation_p (slp_instance slp_instn, int group_size, + VEC (int, heap) *load_permutation) +{ + int i = 0, j, prev = -1, next, k, number_of_groups; + bool supported, bad_permutation = false; + sbitmap load_index; + slp_tree node, other_complex_node; + gimple stmt, first = NULL, other_node_first; + unsigned complex_numbers = 0; + + /* FORNOW: permutations are only supported in SLP. 
*/ + if (!slp_instn) + return false; + + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Load permutation "); + FOR_EACH_VEC_ELT (int, load_permutation, i, next) + fprintf (vect_dump, "%d ", next); + } + + /* In case of reduction every load permutation is allowed, since the order + of the reduction statements is not important (as opposed to the case of + strided stores). The only condition we need to check is that all the + load nodes are of the same size and have the same permutation (and then + rearrange all the nodes of the SLP instance according to this + permutation). */ + + /* Check that all the load nodes are of the same size. */ + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) + { + if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)) + != (unsigned) group_size) + return false; + + stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + if (is_gimple_assign (stmt) + && (gimple_assign_rhs_code (stmt) == REALPART_EXPR + || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR)) + complex_numbers++; + } + + /* Complex operands can be swapped as following: + real_c = real_b + real_a; + imag_c = imag_a + imag_b; + i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of + {real_a, imag_a} and {real_b, imag_b}. We check here that if interleaving + chains are mixed, they match the above pattern. */ + if (complex_numbers) + { + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) + { + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, stmt) + { + if (j == 0) + first = stmt; + else + { + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != first) + { + if (complex_numbers != 2) + return false; + + if (i == 0) + k = 1; + else + k = 0; + + other_complex_node = VEC_index (slp_tree, + SLP_INSTANCE_LOADS (slp_instn), k); + other_node_first = VEC_index (gimple, + SLP_TREE_SCALAR_STMTS (other_complex_node), 0); + + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) + != other_node_first) + return false; + } + } + } + } + } + + /* We checked that this case ok, so there is no need to proceed with + permutation tests. */ + if (complex_numbers == 2) + { + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (slp_instn)); + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn)); + return true; + } + + node = SLP_INSTANCE_TREE (slp_instn); + stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP + instance, not all the loads belong to the same node or interleaving + group. Hence, we need to divide them into groups according to + GROUP_SIZE. */ + number_of_groups = VEC_length (int, load_permutation) / group_size; + + /* Reduction (there are no data-refs in the root). */ + if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) + { + int first_group_load_index; + + /* Compare all the permutation sequences to the first one. */ + for (i = 1; i < number_of_groups; i++) + { + k = 0; + for (j = i * group_size; j < i * group_size + group_size; j++) + { + next = VEC_index (int, load_permutation, j); + first_group_load_index = VEC_index (int, load_permutation, k); + + if (next != first_group_load_index) + { + bad_permutation = true; + break; + } + + k++; + } + + if (bad_permutation) + break; + } + + if (!bad_permutation) + { + /* Check that the loads in the first sequence are different and there + are no gaps between them. 
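+   In other words the first GROUP_SIZE indices must form a permutation of
+   0..GROUP_SIZE-1: for a group of four loads {0, 2, 1, 3} is accepted,
+   while {0, 0, 2, 3} is rejected (index 0 is duplicated and index 1 is
+   never loaded).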
*/ + load_index = sbitmap_alloc (group_size); + sbitmap_zero (load_index); + for (k = 0; k < group_size; k++) + { + first_group_load_index = VEC_index (int, load_permutation, k); + if (TEST_BIT (load_index, first_group_load_index)) + { + bad_permutation = true; + break; + } + + SET_BIT (load_index, first_group_load_index); + } + + if (!bad_permutation) + for (k = 0; k < group_size; k++) + if (!TEST_BIT (load_index, k)) + { + bad_permutation = true; + break; + } + + sbitmap_free (load_index); + } + + if (!bad_permutation) + { + /* This permutation is valid for reduction. Since the order of the + statements in the nodes is not important unless they are memory + accesses, we can rearrange the statements in all the nodes + according to the order of the loads. */ + vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, + load_permutation); + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn)); + return true; + } + } + + /* FORNOW: the only supported permutation is 0..01..1.. of length equal to + GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as + well (unless it's reduction). */ + if (VEC_length (int, load_permutation) + != (unsigned int) (group_size * group_size)) + return false; + + supported = true; + load_index = sbitmap_alloc (group_size); + sbitmap_zero (load_index); + for (j = 0; j < group_size; j++) + { + for (i = j * group_size, k = 0; + VEC_iterate (int, load_permutation, i, next) && k < group_size; + i++, k++) + { + if (i != j * group_size && next != prev) + { + supported = false; + break; + } + + prev = next; + } + + if (TEST_BIT (load_index, prev)) + { + supported = false; + break; + } + + SET_BIT (load_index, prev); + } + + for (j = 0; j < group_size; j++) + if (!TEST_BIT (load_index, j)) + return false; + + sbitmap_free (load_index); + + if (supported && i == group_size * group_size + && vect_supported_slp_permutation_p (slp_instn)) + return true; + + return false; +} + + +/* Find the first load in the loop that belongs to INSTANCE. + When loads are in several SLP nodes, there can be a case in which the first + load does not appear in the first SLP node to be transformed, causing + incorrect order of statements. Since we generate all the loads together, + they must be inserted before the first load of the SLP instance and not + before the first load of the first node of the instance. */ + +static gimple +vect_find_first_load_in_slp_instance (slp_instance instance) +{ + int i, j; + slp_tree load_node; + gimple first_load = NULL, load; + + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (instance), i, load_node) + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (load_node), j, load) + first_load = get_earlier_stmt (load, first_load); + + return first_load; +} + + +/* Find the last store in SLP INSTANCE. */ + +static gimple +vect_find_last_store_in_slp_instance (slp_instance instance) +{ + int i; + slp_tree node; + gimple last_store = NULL, store; + + node = SLP_INSTANCE_TREE (instance); + for (i = 0; + VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, store); + i++) + last_store = get_later_stmt (store, last_store); + + return last_store; +} + + +/* Analyze an SLP instance starting from a group of strided stores. Call + vect_build_slp_tree to build a tree of packed stmts if possible. + Return FALSE if it's impossible to SLP any stmt in the loop. 
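+   A typical root is an interleaved store group such as { a[2*i] = x0;
+   a[2*i+1] = x1 }, from which the tree of packed definition stmts is grown.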
*/ + +static bool +vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, + gimple stmt) +{ + slp_instance new_instance; + slp_tree node = XNEW (struct _slp_tree); + unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); + unsigned int unrolling_factor = 1, nunits; + tree vectype, scalar_type = NULL_TREE; + gimple next; + unsigned int vectorization_factor = 0; + int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i; + unsigned int max_nunits = 0; + VEC (int, heap) *load_permutation; + VEC (slp_tree, heap) *loads; + struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + + if (dr) + { + scalar_type = TREE_TYPE (DR_REF (dr)); + vectype = get_vectype_for_scalar_type (scalar_type); + group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); + } + else + { + gcc_assert (loop_vinfo); + vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)); + } + + if (!vectype) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data-type "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } + + return false; + } + + nunits = TYPE_VECTOR_SUBPARTS (vectype); + if (loop_vinfo) + vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + else + /* No multitypes in BB SLP. */ + vectorization_factor = nunits; + + /* Calculate the unrolling factor. */ + unrolling_factor = least_common_multiple (nunits, group_size) / group_size; + if (unrolling_factor != 1 && !loop_vinfo) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: unrolling required in basic" + " block SLP"); + + return false; + } + + /* Create a node (a root of the SLP tree) for the packed strided stores. */ + SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size); + next = stmt; + if (dr) + { + /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ + while (next) + { + VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next); + next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); + } + } + else + { + /* Collect reduction statements. */ + for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i, + next); + i++) + { + VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next); + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "pushing reduction into node: "); + print_gimple_stmt (vect_dump, next, 0, TDF_SLIM); + } + } + } + + SLP_TREE_VEC_STMTS (node) = NULL; + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; + SLP_TREE_LEFT (node) = NULL; + SLP_TREE_RIGHT (node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0; + + /* Calculate the number of vector stmts to create based on the unrolling + factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is + GROUP_SIZE / NUNITS otherwise. */ + ncopies_for_cost = unrolling_factor * group_size / nunits; + + load_permutation = VEC_alloc (int, heap, group_size * group_size); + loads = VEC_alloc (slp_tree, heap, group_size); + + /* Build the tree for the SLP instance. */ + if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size, + &inside_cost, &outside_cost, ncopies_for_cost, + &max_nunits, &load_permutation, &loads, + vectorization_factor)) + { + /* Create a new SLP instance. 
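+   Note that the unrolling factor may be recomputed below from the smallest
+   type: e.g. with GROUP_SIZE 4 and NUNITS 4 (V4SI), a V16QI operand gives
+   MAX_NUNITS 16 and the factor becomes least_common_multiple (16, 4) / 4 = 4.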
*/ + new_instance = XNEW (struct _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + /* Calculate the unrolling factor based on the smallest type in the + loop. */ + if (max_nunits > nunits) + unrolling_factor = least_common_multiple (max_nunits, group_size) + / group_size; + + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost; + SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost; + SLP_INSTANCE_LOADS (new_instance) = loads; + SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; + SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; + if (VEC_length (slp_tree, loads)) + { + if (!vect_supported_load_permutation_p (new_instance, group_size, + load_permutation)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported load " + "permutation "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + vect_free_slp_instance (new_instance); + return false; + } + + SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) + = vect_find_first_load_in_slp_instance (new_instance); + } + else + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance)); + + if (loop_vinfo) + VEC_safe_push (slp_instance, heap, + LOOP_VINFO_SLP_INSTANCES (loop_vinfo), + new_instance); + else + VEC_safe_push (slp_instance, heap, BB_VINFO_SLP_INSTANCES (bb_vinfo), + new_instance); + + if (vect_print_dump_info (REPORT_SLP)) + vect_print_slp_tree (node); + + return true; + } + + /* Failed to SLP. */ + /* Free the allocated memory. */ + vect_free_slp_tree (node); + VEC_free (int, heap, load_permutation); + VEC_free (slp_tree, heap, loads); + + return false; +} + + +/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP + trees of packed scalar stmts if SLP is possible. */ + +bool +vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) +{ + unsigned int i; + VEC (gimple, heap) *strided_stores, *reductions = NULL; + gimple store; + bool ok = false; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_analyze_slp ==="); + + if (loop_vinfo) + { + strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo); + reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo); + } + else + strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo); + + /* Find SLP sequences starting from groups of strided stores. */ + FOR_EACH_VEC_ELT (gimple, strided_stores, i, store) + if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store)) + ok = true; + + if (bb_vinfo && !ok) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Failed to SLP the basic block."); + + return false; + } + + /* Find SLP sequences starting from groups of reductions. */ + if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)) > 1 + && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, + VEC_index (gimple, reductions, 0))) + ok = true; + + return true; +} + + +/* For each possible SLP instance decide whether to SLP it and calculate overall + unrolling factor needed to SLP the loop. 
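+   The overall factor is the maximum over all instances: if one instance
+   needs an unrolling factor of 2 and another needs 4, the loop is unrolled
+   by 4.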
*/ + +void +vect_make_slp_decision (loop_vec_info loop_vinfo) +{ + unsigned int i, unrolling_factor = 1; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + int decided_to_slp = 0; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_make_slp_decision ==="); + + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + { + /* FORNOW: SLP if you can. */ + if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance)) + unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance); + + /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we + call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and + loop-based vectorization. Such stmts will be marked as HYBRID. */ + vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); + decided_to_slp++; + } + + LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; + + if (decided_to_slp && vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d", + decided_to_slp, unrolling_factor); +} + + +/* Find stmts that must be both vectorized and SLPed (since they feed stmts that + can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */ + +static void +vect_detect_hybrid_slp_stmts (slp_tree node) +{ + int i; + gimple stmt; + imm_use_iterator imm_iter; + gimple use_stmt; + stmt_vec_info stmt_vinfo; + + if (!node) + return; + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + if (PURE_SLP_STMT (vinfo_for_stmt (stmt)) + && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME) + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0)) + if ((stmt_vinfo = vinfo_for_stmt (use_stmt)) + && !STMT_SLP_TYPE (stmt_vinfo) + && (STMT_VINFO_RELEVANT (stmt_vinfo) + || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))) + && !(gimple_code (use_stmt) == GIMPLE_PHI + && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt)) + == vect_reduction_def)) + vect_mark_slp_stmts (node, hybrid, i); + + vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node)); + vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node)); +} + + +/* Find stmts that must be both vectorized and SLPed. */ + +void +vect_detect_hybrid_slp (loop_vec_info loop_vinfo) +{ + unsigned int i; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_detect_hybrid_slp ==="); + + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance)); +} + + +/* Create and initialize a new bb_vec_info struct for BB, as well as + stmt_vec_info structs for all the stmts in it. */ + +static bb_vec_info +new_bb_vec_info (basic_block bb) +{ + bb_vec_info res = NULL; + gimple_stmt_iterator gsi; + + res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info)); + BB_VINFO_BB (res) = bb; + + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + gimple_set_uid (stmt, 0); + set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res)); + } + + BB_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); + BB_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 2); + + bb->aux = res; + return res; +} + + +/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the + stmts in the basic block. 
*/ + +static void +destroy_bb_vec_info (bb_vec_info bb_vinfo) +{ + basic_block bb; + gimple_stmt_iterator si; + + if (!bb_vinfo) + return; + + bb = BB_VINFO_BB (bb_vinfo); + + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + gimple stmt = gsi_stmt (si); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + + if (stmt_info) + /* Free stmt_vec_info. */ + free_stmt_vec_info (stmt); + } + + free_data_refs (BB_VINFO_DATAREFS (bb_vinfo)); + free_dependence_relations (BB_VINFO_DDRS (bb_vinfo)); + VEC_free (gimple, heap, BB_VINFO_STRIDED_STORES (bb_vinfo)); + VEC_free (slp_instance, heap, BB_VINFO_SLP_INSTANCES (bb_vinfo)); + free (bb_vinfo); + bb->aux = NULL; +} + + +/* Analyze statements contained in SLP tree node after recursively analyzing + the subtree. Return TRUE if the operations are supported. */ + +static bool +vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node) +{ + bool dummy; + int i; + gimple stmt; + + if (!node) + return true; + + if (!vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_LEFT (node)) + || !vect_slp_analyze_node_operations (bb_vinfo, SLP_TREE_RIGHT (node))) + return false; + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt) + { + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + gcc_assert (stmt_info); + gcc_assert (PURE_SLP_STMT (stmt_info)); + + if (!vect_analyze_stmt (stmt, &dummy, node)) + return false; + } + + return true; +} + + +/* Analyze statements in SLP instances of the basic block. Return TRUE if the + operations are supported. */ + +static bool +vect_slp_analyze_operations (bb_vec_info bb_vinfo) +{ + VEC (slp_instance, heap) *slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); + slp_instance instance; + int i; + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); ) + { + if (!vect_slp_analyze_node_operations (bb_vinfo, + SLP_INSTANCE_TREE (instance))) + { + vect_free_slp_instance (instance); + VEC_ordered_remove (slp_instance, slp_instances, i); + } + else + i++; + } + + if (!VEC_length (slp_instance, slp_instances)) + return false; + + return true; +} + +/* Check if loads and stores are mixed in the basic block (in that + case if we are not sure that the accesses differ, we can't vectorize the + basic block). Also return FALSE in case that there is statement marked as + not vectorizable. */ + +static bool +vect_bb_vectorizable_with_dependencies (bb_vec_info bb_vinfo) +{ + basic_block bb = BB_VINFO_BB (bb_vinfo); + gimple_stmt_iterator si; + bool detected_store = false; + gimple stmt; + struct data_reference *dr; + + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + stmt = gsi_stmt (si); + + /* We can't allow not analyzed statements, since they may contain data + accesses. */ + if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt))) + return false; + + if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) + continue; + + dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + if (DR_IS_READ (dr) && detected_store) + return false; + + if (!DR_IS_READ (dr)) + detected_store = true; + } + + return true; +} + +/* Check if vectorization of the basic block is profitable. 
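+   The test is simply vec_outside_cost + vec_inside_cost < scalar_cost;
+   e.g. with hypothetical target costs of 1 per scalar stmt, eight scalar
+   stmts (scalar_cost 8) against a vector cost of 2 + 1 = 3 are profitable.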
*/ + +static bool +vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) +{ + VEC (slp_instance, heap) *slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); + slp_instance instance; + int i; + unsigned int vec_outside_cost = 0, vec_inside_cost = 0, scalar_cost = 0; + unsigned int stmt_cost; + gimple stmt; + gimple_stmt_iterator si; + basic_block bb = BB_VINFO_BB (bb_vinfo); + stmt_vec_info stmt_info = NULL; + tree dummy_type = NULL; + int dummy = 0; + + /* Calculate vector costs. */ + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + { + vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); + vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); + } + + /* Calculate scalar cost. */ + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + stmt = gsi_stmt (si); + stmt_info = vinfo_for_stmt (stmt); + + if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info) + || !PURE_SLP_STMT (stmt_info)) + continue; + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) + stmt_cost = targetm.vectorize.builtin_vectorization_cost + (scalar_load, dummy_type, dummy); + else + stmt_cost = targetm.vectorize.builtin_vectorization_cost + (scalar_store, dummy_type, dummy); + } + else + stmt_cost = targetm.vectorize.builtin_vectorization_cost + (scalar_stmt, dummy_type, dummy); + + scalar_cost += stmt_cost; + } + + if (vect_print_dump_info (REPORT_COST)) + { + fprintf (vect_dump, "Cost model analysis: \n"); + fprintf (vect_dump, " Vector inside of basic block cost: %d\n", + vec_inside_cost); + fprintf (vect_dump, " Vector outside of basic block cost: %d\n", + vec_outside_cost); + fprintf (vect_dump, " Scalar cost of basic block: %d", scalar_cost); + } + + /* Vectorization is profitable if its cost is less than the cost of scalar + version. */ + if (vec_outside_cost + vec_inside_cost >= scalar_cost) + return false; + + return true; +} + +/* Check if the basic block can be vectorized. 
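+   A classic candidate is a block of isomorphic stmts on consecutive
+   addresses, e.g. a[0] = b[0] + c[0]; ... a[3] = b[3] + c[3]. The routine
+   runs the data-ref, dependence, alignment, access and SLP analyses below
+   and finally applies the cost model.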
*/ + +bb_vec_info +vect_slp_analyze_bb (basic_block bb) +{ + bb_vec_info bb_vinfo; + VEC (ddr_p, heap) *ddrs; + VEC (slp_instance, heap) *slp_instances; + slp_instance instance; + int i, insns = 0; + gimple_stmt_iterator gsi; + int min_vf = 2; + int max_vf = MAX_VECTORIZATION_FACTOR; + bool data_dependence_in_bb = false; + + current_vector_size = 0; + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "===vect_slp_analyze_bb===\n"); + + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + if (!is_gimple_debug (stmt) + && !gimple_nop_p (stmt) + && gimple_code (stmt) != GIMPLE_LABEL) + insns++; + } + + if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: too many instructions in basic " + "block.\n"); + + return NULL; + } + + bb_vinfo = new_bb_vec_info (bb); + if (!bb_vinfo) + return NULL; + + if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: unhandled data-ref in basic " + "block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + ddrs = BB_VINFO_DDRS (bb_vinfo); + if (!VEC_length (ddr_p, ddrs)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: not enough data-refs in basic " + "block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf, + &data_dependence_in_bb) + || min_vf > max_vf + || (data_dependence_in_bb + && !vect_bb_vectorizable_with_dependencies (bb_vinfo))) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: unhandled data dependence " + "in basic block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: bad data alignment in basic " + "block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: unhandled data access in basic " + "block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + if (!vect_verify_datarefs_alignment (NULL, bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: unsupported alignment in basic " + "block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + /* Check the SLP opportunities in the basic block, analyze and build SLP + trees. */ + if (!vect_analyze_slp (NULL, bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: failed to find SLP opportunities " + "in basic block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); + + /* Mark all the statements that we want to vectorize as pure SLP and + relevant. 
*/ + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + { + vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); + vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); + } + + if (!vect_slp_analyze_operations (bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: bad operation in basic block.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + /* Cost model: check if the vectorization is worthwhile. */ + if (flag_vect_cost_model + && !vect_bb_vectorization_profitable_p (bb_vinfo)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "not vectorized: vectorization is not " + "profitable.\n"); + + destroy_bb_vec_info (bb_vinfo); + return NULL; + } + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Basic block will be vectorized using SLP\n"); + + return bb_vinfo; +} + + +/* SLP costs are calculated according to SLP instance unrolling factor (i.e., + the number of created vector stmts depends on the unrolling factor). + However, the actual number of vector stmts for every SLP node depends on + VF which is set later in vect_analyze_operations (). Hence, SLP costs + should be updated. In this function we assume that the inside costs + calculated in vect_model_xxx_cost are linear in ncopies. */ + +void +vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo) +{ + unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ==="); + + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + /* We assume that costs are linear in ncopies. */ + SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf + / SLP_INSTANCE_UNROLLING_FACTOR (instance); +} + + +/* For constant and loop invariant defs of SLP_NODE this function returns + (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. + OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of + scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. + REDUC_INDEX is the index of the reduction operand in the statements, unless + it is -1. */ + +static void +vect_get_constant_vectors (tree op, slp_tree slp_node, + VEC (tree, heap) **vec_oprnds, + unsigned int op_num, unsigned int number_of_vectors, + int reduc_index) +{ + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); + gimple stmt = VEC_index (gimple, stmts, 0); + stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + int nunits; + tree vec_cst; + tree t = NULL_TREE; + int j, number_of_places_left_in_vector; + tree vector_type; + tree vop; + int group_size = VEC_length (gimple, stmts); + unsigned int vec_num, i; + int number_of_copies = 1; + VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors); + bool constant_p, is_store; + tree neutral_op = NULL; + enum tree_code code = gimple_assign_rhs_code (stmt); + + if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) + { + if (reduc_index == -1) + { + VEC_free (tree, heap, *vec_oprnds); + return; + } + + op_num = reduc_index - 1; + op = gimple_op (stmt, reduc_index); + /* For additional copies (see the explanation of NUMBER_OF_COPIES below) + we need either neutral operands or the original operands. See + get_initial_def_for_reduction() for details. 
*/ + switch (code) + { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: + case PLUS_EXPR: + case MINUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op))) + neutral_op = build_real (TREE_TYPE (op), dconst0); + else + neutral_op = build_int_cst (TREE_TYPE (op), 0); + + break; + + case MULT_EXPR: + if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op))) + neutral_op = build_real (TREE_TYPE (op), dconst1); + else + neutral_op = build_int_cst (TREE_TYPE (op), 1); + + break; + + case BIT_AND_EXPR: + neutral_op = build_int_cst (TREE_TYPE (op), -1); + break; + + default: + neutral_op = NULL; + } + } + + if (STMT_VINFO_DATA_REF (stmt_vinfo)) + { + is_store = true; + op = gimple_assign_rhs1 (stmt); + } + else + is_store = false; + + gcc_assert (op); + + if (CONSTANT_CLASS_P (op)) + constant_p = true; + else + constant_p = false; + + vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); + gcc_assert (vector_type); + nunits = TYPE_VECTOR_SUBPARTS (vector_type); + + /* NUMBER_OF_COPIES is the number of times we need to use the same values in + created vectors. It is greater than 1 if unrolling is performed. + + For example, we have two scalar operands, s1 and s2 (e.g., group of + strided accesses of size two), while NUNITS is four (i.e., four scalars + of this type can be packed in a vector). The output vector will contain + two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES + will be 2). + + If GROUP_SIZE > NUNITS, the scalars will be split into several vectors + containing the operands. + + For example, NUNITS is four as before, and the group size is 8 + (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and + {s5, s6, s7, s8}. */ + + number_of_copies = least_common_multiple (nunits, group_size) / group_size; + + number_of_places_left_in_vector = nunits; + for (j = 0; j < number_of_copies; j++) + { + for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--) + { + if (is_store) + op = gimple_assign_rhs1 (stmt); + else + op = gimple_op (stmt, op_num + 1); + + if (reduc_index != -1) + { + struct loop *loop = (gimple_bb (stmt))->loop_father; + gimple def_stmt = SSA_NAME_DEF_STMT (op); + + gcc_assert (loop); + /* Get the def before the loop. */ + op = PHI_ARG_DEF_FROM_EDGE (def_stmt, + loop_preheader_edge (loop)); + if (j != (number_of_copies - 1) && neutral_op) + op = neutral_op; + } + + /* Create 'vect_ = {op0,op1,...,opn}'. */ + t = tree_cons (NULL_TREE, op, t); + + number_of_places_left_in_vector--; + + if (number_of_places_left_in_vector == 0) + { + number_of_places_left_in_vector = nunits; + + if (constant_p) + vec_cst = build_vector (vector_type, t); + else + vec_cst = build_constructor_from_list (vector_type, t); + VEC_quick_push (tree, voprnds, + vect_init_vector (stmt, vec_cst, vector_type, NULL)); + t = NULL_TREE; + } + } + } + + /* Since the vectors are created in the reverse order, we should invert + them. */ + vec_num = VEC_length (tree, voprnds); + for (j = vec_num - 1; j >= 0; j--) + { + vop = VEC_index (tree, voprnds, j); + VEC_quick_push (tree, *vec_oprnds, vop); + } + + VEC_free (tree, heap, voprnds); + + /* In case that VF is greater than the unrolling factor needed for the SLP + group of stmts, NUMBER_OF_VECTORS to be created is greater than + NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have + to replicate the vectors. 
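+   For a reduction the extra copies are filled with the neutral vector
+   (e.g. {0, 0, 0, 0} for addition); otherwise the vectors already created
+   are reused in a cycle.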
*/ + while (number_of_vectors > VEC_length (tree, *vec_oprnds)) + { + tree neutral_vec = NULL; + + if (neutral_op) + { + if (!neutral_vec) + neutral_vec = build_vector_from_val (vector_type, neutral_op); + + VEC_quick_push (tree, *vec_oprnds, neutral_vec); + } + else + { + for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) + VEC_quick_push (tree, *vec_oprnds, vop); + } + } +} + + +/* Get vectorized definitions from SLP_NODE that contains corresponding + vectorized def-stmts. */ + +static void +vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) +{ + tree vec_oprnd; + gimple vec_def_stmt; + unsigned int i; + + gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); + + FOR_EACH_VEC_ELT (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt) + { + gcc_assert (vec_def_stmt); + vec_oprnd = gimple_get_lhs (vec_def_stmt); + VEC_quick_push (tree, *vec_oprnds, vec_oprnd); + } +} + + +/* Get vectorized definitions for SLP_NODE. + If the scalar definitions are loop invariants or constants, collect them and + call vect_get_constant_vectors() to create vector stmts. + Otherwise, the def-stmts must be already vectorized and the vectorized stmts + must be stored in the LEFT/RIGHT node of SLP_NODE, and we call + vect_get_slp_vect_defs() to retrieve them. + If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from + the right node. This is used when the second operand must remain scalar. */ + +void +vect_get_slp_defs (tree op0, tree op1, slp_tree slp_node, + VEC (tree,heap) **vec_oprnds0, + VEC (tree,heap) **vec_oprnds1, int reduc_index) +{ + gimple first_stmt; + enum tree_code code; + int number_of_vects; + HOST_WIDE_INT lhs_size_unit, rhs_size_unit; + + first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); + /* The number of vector defs is determined by the number of vector statements + in the node from which we get those statements. */ + if (SLP_TREE_LEFT (slp_node)) + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node)); + else + { + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + /* Number of vector stmts was calculated according to LHS in + vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if + necessary. See vect_get_smallest_scalar_type () for details. */ + vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, + &rhs_size_unit); + if (rhs_size_unit != lhs_size_unit) + { + number_of_vects *= rhs_size_unit; + number_of_vects /= lhs_size_unit; + } + } + + /* Allocate memory for vectorized defs. */ + *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects); + + /* SLP_NODE corresponds either to a group of stores or to a group of + unary/binary operations. We don't call this function for loads. + For reduction defs we call vect_get_constant_vectors(), since we are + looking for initial loop invariant values. */ + if (SLP_TREE_LEFT (slp_node) && reduc_index == -1) + /* The defs are already vectorized. */ + vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); + else + /* Build vectors from scalar defs. */ + vect_get_constant_vectors (op0, slp_node, vec_oprnds0, 0, number_of_vects, + reduc_index); + + if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) + /* Since we don't call this function with loads, this is a group of + stores. */ + return; + + /* For reductions, we only need initial values. 
*/
+  if (reduc_index != -1)
+    return;
+
+  code = gimple_assign_rhs_code (first_stmt);
+  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
+    return;
+
+  /* The number of vector defs is determined by the number of vector
+     statements in the node from which we get those statements.  */
+  if (SLP_TREE_RIGHT (slp_node))
+    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
+  else
+    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+
+  *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
+
+  if (SLP_TREE_RIGHT (slp_node))
+    /* The defs are already vectorized.  */
+    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
+  else
+    /* Build vectors from scalar defs.  */
+    vect_get_constant_vectors (op1, slp_node, vec_oprnds1, 1,
+                               number_of_vects, -1);
+}
+
+
+/* Create NCOPIES permutation statements using the mask MASK (by building
+   a vector of type MASK_TYPE from it) and two input vectors placed in
+   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy,
+   shifting by STRIDE elements of DR_CHAIN for every copy (STRIDE is the
+   number of vectorized stmts for NODE divided by the number of copies).
+   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
+   where the created stmts must be inserted.  */
+
+static inline void
+vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+                           tree mask, int first_vec_indx, int second_vec_indx,
+                           gimple_stmt_iterator *gsi, slp_tree node,
+                           tree builtin_decl, tree vectype,
+                           VEC(tree,heap) *dr_chain,
+                           int ncopies, int vect_stmts_counter)
+{
+  tree perm_dest;
+  gimple perm_stmt = NULL;
+  stmt_vec_info next_stmt_info;
+  int i, stride;
+  tree first_vec, second_vec, data_ref;
+
+  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
+
+  /* Initialize the vect stmts of NODE to properly insert the generated
+     stmts later.  */
+  for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
+       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
+    VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
+
+  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+  for (i = 0; i < ncopies; i++)
+    {
+      first_vec = VEC_index (tree, dr_chain, first_vec_indx);
+      second_vec = VEC_index (tree, dr_chain, second_vec_indx);
+
+      /* Generate the permute statement.  */
+      perm_stmt = gimple_build_call (builtin_decl,
+                                     3, first_vec, second_vec, mask);
+      data_ref = make_ssa_name (perm_dest, perm_stmt);
+      gimple_call_set_lhs (perm_stmt, data_ref);
+      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+
+      /* Store the vector statement in NODE.  */
+      VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
+                   stride * i + vect_stmts_counter, perm_stmt);
+
+      first_vec_indx += stride;
+      second_vec_indx += stride;
+    }
+
+  /* Mark the scalar stmt as vectorized.  */
+  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
+  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
+}
+
+
+/* Given FIRST_MASK_ELEMENT, the mask element in element representation,
+   return in CURRENT_MASK_ELEMENT its equivalent in the target specific
+   representation.  Check that the mask is valid and return FALSE if not.
+   Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
+   the next vector, i.e., the current first vector is not needed.
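+
+   A sketch of the range rule enforced here (plain C, hypothetical
+   values): a permute consumes two input vectors, so after adjustment a
+   mask element must land in [0, 2 * nunits):
+
+     int nunits = 4;
+     int elt = 9;                  // would index into a third vector
+     if (elt >= 2 * nunits)
+       elt -= nunits;              // slide the window: 9 becomes 5
+
+   Sliding the window drops the current first vector, so it is only
+   legal when no pending mask element still needs that vector.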
*/ + +static bool +vect_get_mask_element (gimple stmt, int first_mask_element, int m, + int mask_nunits, bool only_one_vec, int index, + int *mask, int *current_mask_element, + bool *need_next_vector, int *number_of_mask_fixes, + bool *mask_fixed, bool *needs_first_vector) +{ + int i; + + /* Convert to target specific representation. */ + *current_mask_element = first_mask_element + m; + /* Adjust the value in case it's a mask for second and third vectors. */ + *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1); + + if (*current_mask_element < mask_nunits) + *needs_first_vector = true; + + /* We have only one input vector to permute but the mask accesses values in + the next vector as well. */ + if (only_one_vec && *current_mask_element >= mask_nunits) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at least two vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* The mask requires the next vector. */ + if (*current_mask_element >= mask_nunits * 2) + { + if (*needs_first_vector || *mask_fixed) + { + /* We either need the first vector too or have already moved to the + next vector. In both cases, this permutation needs three + vectors. */ + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at " + "least three vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* We move to the next vector, dropping the first one and working with + the second and the third - we need to adjust the values of the mask + accordingly. */ + *current_mask_element -= mask_nunits * *number_of_mask_fixes; + + for (i = 0; i < index; i++) + mask[i] -= mask_nunits * *number_of_mask_fixes; + + (*number_of_mask_fixes)++; + *mask_fixed = true; + } + + *need_next_vector = *mask_fixed; + + /* This was the last element of this mask. Start a new one. */ + if (index == mask_nunits - 1) + { + *number_of_mask_fixes = 1; + *mask_fixed = false; + *needs_first_vector = false; + } + + return true; +} + + +/* Generate vector permute statements from a list of loads in DR_CHAIN. + If ANALYZE_ONLY is TRUE, only check that it is possible to create valid + permute statements for SLP_NODE_INSTANCE. 
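+
+   A sketch of the bookkeeping below (plain C, hypothetical numbers):
+   with GROUP_SIZE = 3 loads, NUNITS = 4 lanes, an instance unrolling
+   factor of 4 and a loop VF of 8,
+
+     int group_size = 3, nunits = 4, instance_uf = 4, vf = 8;
+     int orig_vecs = group_size * instance_uf / nunits;  // 3 input vectors
+     int only_one = (orig_vecs == 1);                    // false here
+     int ncopies = vf / instance_uf;                     // 2 copies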
*/
+bool
+vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
+                              gimple_stmt_iterator *gsi, int vf,
+                              slp_instance slp_node_instance,
+                              bool analyze_only)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree mask_element_type = NULL_TREE, mask_type;
+  int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
+  slp_tree node;
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
+  gimple next_scalar_stmt;
+  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+  int first_mask_element;
+  int index, unroll_factor, *mask, current_mask_element, ncopies;
+  bool only_one_vec = false, need_next_vector = false;
+  int first_vec_index, second_vec_index, orig_vec_stmts_num;
+  int vect_stmts_counter;
+  int number_of_mask_fixes = 1;
+  bool mask_fixed = false;
+  bool needs_first_vector = false;
+
+  if (!targetm.vectorize.builtin_vec_perm)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        {
+          fprintf (vect_dump, "no builtin for vect permute for ");
+          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+        }
+
+      return false;
+    }
+
+  builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
+                                                     &mask_element_type);
+  if (!builtin_decl || !mask_element_type)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        {
+          fprintf (vect_dump, "no builtin for vect permute for ");
+          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+        }
+
+      return false;
+    }
+
+  mask_type = get_vectype_for_scalar_type (mask_element_type);
+  mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
+  mask = (int *) xmalloc (sizeof (int) * mask_nunits);
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  scale = mask_nunits / nunits;
+  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
+
+  /* The number of vector stmts to generate based only on the
+     SLP_NODE_INSTANCE unrolling factor.  */
+  orig_vec_stmts_num = group_size *
+                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  if (orig_vec_stmts_num == 1)
+    only_one_vec = true;
+
+  /* The number of copies is determined by the final vectorization factor
+     relative to the SLP_NODE_INSTANCE unrolling factor.  */
+  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
+
+  /* Generate permutation masks for every NODE.  The number of masks for
+     each NODE is equal to GROUP_SIZE.
+     E.g., we have a group of three nodes with three loads from the same
+     location in each node, and the vector size is 4.  I.e., we have an
+     a0b0c0a1b1c1... sequence and we need to create the following vectors:
+     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
+     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
+     ...
+
+     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in the
+     target specific type, e.g., in bytes for Altivec).
+     The last mask is illegal since we assume two operands for the permute
+     operation, and the mask element values can't be outside that range.
+     Hence, the last mask must be converted into {2,5,5,5}.
+     For the first two permutations we need the first and the second input
+     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
+     we need the second and the third vectors: {b1,c1,a2,b2} and
+     {c2,a3,b3,c3}.
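+
+     These element-representation masks follow directly from
+     FIRST_MASK_ELEMENT = (I + J * GROUP_SIZE) * SCALE in the loops
+     below; a plain-C sketch for the a's (i == 0, scale == 1):
+
+       int group_size = 3, unroll_factor = 4, i = 0, n = 0, seq[12];
+       for (int j = 0; j < unroll_factor; j++)
+         for (int k = 0; k < group_size; k++)
+           seq[n++] = (i + j * group_size) * 1;    // scale == 1
+       // seq = 0 0 0 3 3 3 6 6 6 9 9 9, i.e. the masks
+       // {0,0,0,3} {3,3,6,6} {6,9,9,9} before the range fixing above.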
*/
+
+  FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), i, node)
+    {
+      scalar_index = 0;
+      index = 0;
+      vect_stmts_counter = 0;
+      vec_index = 0;
+      first_vec_index = vec_index++;
+      if (only_one_vec)
+        second_vec_index = first_vec_index;
+      else
+        second_vec_index = vec_index++;
+
+      for (j = 0; j < unroll_factor; j++)
+        {
+          for (k = 0; k < group_size; k++)
+            {
+              first_mask_element = (i + j * group_size) * scale;
+              for (m = 0; m < scale; m++)
+                {
+                  if (!vect_get_mask_element (stmt, first_mask_element, m,
+                                   mask_nunits, only_one_vec, index, mask,
+                                   &current_mask_element, &need_next_vector,
+                                   &number_of_mask_fixes, &mask_fixed,
+                                   &needs_first_vector))
+                    {
+                      /* Don't leak the mask buffer on failure.  */
+                      free (mask);
+                      return false;
+                    }
+
+                  mask[index++] = current_mask_element;
+                }
+
+              if (index == mask_nunits)
+                {
+                  tree mask_vec = NULL;
+
+                  while (--index >= 0)
+                    {
+                      tree t = build_int_cst (mask_element_type, mask[index]);
+                      mask_vec = tree_cons (NULL, t, mask_vec);
+                    }
+                  mask_vec = build_vector (mask_type, mask_vec);
+                  index = 0;
+
+                  if (!targetm.vectorize.builtin_vec_perm_ok (vectype,
+                                                              mask_vec))
+                    {
+                      if (vect_print_dump_info (REPORT_DETAILS))
+                        {
+                          fprintf (vect_dump, "unsupported vect permute ");
+                          print_generic_expr (vect_dump, mask_vec, 0);
+                        }
+                      free (mask);
+                      return false;
+                    }
+
+                  if (!analyze_only)
+                    {
+                      if (need_next_vector)
+                        {
+                          first_vec_index = second_vec_index;
+                          second_vec_index = vec_index;
+                        }
+
+                      next_scalar_stmt = VEC_index (gimple,
+                                SLP_TREE_SCALAR_STMTS (node), scalar_index++);
+
+                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
+                               mask_vec, first_vec_index, second_vec_index,
+                               gsi, node, builtin_decl, vectype, dr_chain,
+                               ncopies, vect_stmts_counter++);
+                    }
+                }
+            }
+        }
+    }
+
+  free (mask);
+  return true;
+}
+
+
+
+/* Vectorize SLP instance tree in postorder.  */
+
+static bool
+vect_schedule_slp_instance (slp_tree node, slp_instance instance,
+                            unsigned int vectorization_factor)
+{
+  gimple stmt;
+  bool strided_store, is_store;
+  gimple_stmt_iterator si;
+  stmt_vec_info stmt_info;
+  unsigned int vec_stmts_size, nunits, group_size;
+  tree vectype;
+  int i;
+  slp_tree loads_node;
+
+  if (!node)
+    return false;
+
+  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
+                              vectorization_factor);
+  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
+                              vectorization_factor);
+
+  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+  stmt_info = vinfo_for_stmt (stmt);
+
+  /* VECTYPE is the type of the destination.  */
+  vectype = STMT_VINFO_VECTYPE (stmt_info);
+  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
+  group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+
+  /* For each SLP instance calculate the number of vector stmts to be
+     created for the scalar stmts in each node of the SLP tree.  The number
+     of vector elements in one vector iteration is the number of scalar
+     elements in one scalar iteration (GROUP_SIZE) multiplied by VF divided
+     by the vector size.  */
+  vec_stmts_size = (vectorization_factor * group_size) / nunits;
+
+  /* In case of load permutation we have to allocate vectorized statements
+     for all the nodes that participate in that permutation.
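+
+     Each such node needs VEC_STMTS_SIZE slots.  A worked example with
+     hypothetical numbers: VF = 8, GROUP_SIZE = 3 and NUNITS = 4 give
+
+       int vf = 8, group_size = 3, nunits = 4;
+       int vec_stmts_size = (vf * group_size) / nunits;   // 6 vector stmts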
*/ + if (SLP_INSTANCE_LOAD_PERMUTATION (instance)) + { + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node) + { + if (!SLP_TREE_VEC_STMTS (loads_node)) + { + SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap, + vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; + } + } + } + + if (!SLP_TREE_VEC_STMTS (node)) + { + SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; + } + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "------>vectorizing SLP node starting from: "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + /* Loads should be inserted before the first load. */ + if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) + && STMT_VINFO_STRIDED_ACCESS (stmt_info) + && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))) + si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); + else + si = gsi_for_stmt (stmt); + + /* Stores should be inserted just before the last store. */ + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) + && REFERENCE_CLASS_P (gimple_get_lhs (stmt))) + { + gimple last_store = vect_find_last_store_in_slp_instance (instance); + si = gsi_for_stmt (last_store); + } + + is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance); + return is_store; +} + + +/* Generate vector code for all SLP instances in the loop/basic block. */ + +bool +vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) +{ + VEC (slp_instance, heap) *slp_instances; + slp_instance instance; + unsigned int i, vf; + bool is_store = false; + + if (loop_vinfo) + { + slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + } + else + { + slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); + vf = 1; + } + + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + { + /* Schedule the tree of INSTANCE. */ + is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), + instance, vf); + if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS) + || vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) + fprintf (vect_dump, "vectorizing stmts using SLP."); + } + + FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance) + { + slp_tree root = SLP_INSTANCE_TREE (instance); + gimple store; + unsigned int j; + gimple_stmt_iterator gsi; + + for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store) + && j < SLP_INSTANCE_GROUP_SIZE (instance); j++) + { + if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store))) + break; + + /* Free the attached stmt_vec_info and remove the stmt. */ + gsi = gsi_for_stmt (store); + gsi_remove (&gsi, true); + free_stmt_vec_info (store); + } + } + + return is_store; +} + + +/* Vectorize the basic block. */ + +void +vect_slp_transform_bb (basic_block bb) +{ + bb_vec_info bb_vinfo = vec_info_for_bb (bb); + gimple_stmt_iterator si; + + gcc_assert (bb_vinfo); + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "SLPing BB\n"); + + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + gimple stmt = gsi_stmt (si); + stmt_vec_info stmt_info; + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "------>SLPing statement: "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + stmt_info = vinfo_for_stmt (stmt); + gcc_assert (stmt_info); + + /* Schedule all the SLP instances when the first SLP stmt is reached. 
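+
+         One call schedules every instance recorded for the block, so
+         the scan fires once and stops.  The pattern, as a sketch
+         (plain C, hypothetical names):
+
+           for (int i = 0; i < nstmts; i++)
+             if (is_slp_stmt[i])
+               {
+                 schedule_all_slp_instances ();   // hypothetical helper
+                 break;
+               }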
*/
+      if (STMT_SLP_TYPE (stmt_info))
+        {
+          vect_schedule_slp (NULL, bb_vinfo);
+          break;
+        }
+    }
+
+  mark_sym_for_renaming (gimple_vop (cfun));
+  /* The memory tags and pointers in vectorized statements need to
+     have their SSA forms updated.  FIXME, why can't this be delayed
+     until all the loops have been transformed?  */
+  update_ssa (TODO_update_ssa);
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "BASIC BLOCK VECTORIZED\n");
+
+  destroy_bb_vec_info (bb_vinfo);
+}