/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_

#include "base/macros.h"
#include "base/scoped_arena_allocator.h"
#include "base/scoped_arena_containers.h"
#include "induction_var_range.h"
#include "loop_analysis.h"
#include "nodes.h"
#include "optimization.h"
#include "superblock_cloner.h"

namespace art HIDDEN {

class CompilerOptions;
class ArchNoOptsLoopHelper;

// Determines whether predicated loop vectorization should be tried for ALL loops.
#ifdef ART_FORCE_TRY_PREDICATED_SIMD
  static constexpr bool kForceTryPredicatedSIMD = true;
#else
  static constexpr bool kForceTryPredicatedSIMD = false;
#endif

/**
 * Loop optimizations. Builds a loop hierarchy and applies optimizations to
 * the detected nested loops, such as removal of dead induction and empty loops
 * and inner loop vectorization.
 */
class HLoopOptimization : public HOptimization {
 public:
  HLoopOptimization(HGraph* graph,
                    const CodeGenerator& codegen,    // Needs info about the target.
                    HInductionVarAnalysis* induction_analysis,
                    OptimizingCompilerStats* stats,
                    const char* name = kLoopOptimizationPassName);

  bool Run() override;

  static constexpr const char* kLoopOptimizationPassName = "loop_optimization";

  // The maximum number of total instructions (trip_count * instruction_count),
  // where the optimization of removing SuspendChecks from the loop header could
  // be performed.
  static constexpr int64_t kMaxTotalInstRemoveSuspendCheck = 128;

 private:
  /**
   * A single loop inside the loop hierarchy representation.
   */
  struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
    explicit LoopNode(HLoopInformation* lp_info)
        : loop_info(lp_info),
          outer(nullptr),
          inner(nullptr),
          previous(nullptr),
          next(nullptr),
          try_catch_kind(TryCatchKind::kUnknown) {}

    enum class TryCatchKind {
      kUnknown,
      // Either if we have a try catch in the loop, or if the loop is inside of an outer try catch,
      // we set `kHasTryCatch`.
      kHasTryCatch,
      kNoTryCatch
    };

    HLoopInformation* loop_info;
    LoopNode* outer;
    LoopNode* inner;
    LoopNode* previous;
    LoopNode* next;
    TryCatchKind try_catch_kind;
  };

  /*
   * Vectorization restrictions (bit mask).
   */
  enum VectorRestrictions {
    kNone            = 0,        // no restrictions
    kNoMul           = 1 << 0,   // no multiplication
    kNoDiv           = 1 << 1,   // no division
    kNoShift         = 1 << 2,   // no shift
    kNoShr           = 1 << 3,   // no arithmetic shift right
    kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
    kNoSignedHAdd    = 1 << 5,   // no signed halving add
    kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
    kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
    kNoAbs           = 1 << 8,   // no absolute value
    kNoStringCharAt  = 1 << 9,   // no StringCharAt
    kNoReduction     = 1 << 10,  // no reduction
    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
    kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
    kNoDotProd       = 1 << 13,  // no dot product
    kNoIfCond        = 1 << 14,  // no if condition conversion
  };

  /*
   * Loop synthesis mode during vectorization
   * (sequential peeling/cleanup loop or vector loop).
   */
  enum class LoopSynthesisMode {
    kSequential,
    kVector
  };
  friend std::ostream& operator<<(std::ostream& os, const LoopSynthesisMode& fd_logger);

  /*
   * Representation of a unit-stride array reference.
   */
  struct ArrayReference {
    ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
        : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    bool operator<(const ArrayReference& other) const {
      return
          (base < other.base) ||
          (base == other.base &&
           (offset < other.offset || (offset == other.offset &&
                                      (type < other.type ||
                                       (type == other.type &&
                                        (lhs < other.lhs ||
                                         (lhs == other.lhs &&
                                          is_string_char_at < other.is_string_char_at)))))));
    }
    HInstruction* base;      // base address
    HInstruction* offset;    // offset + i
    DataType::Type type;     // component type
    bool lhs;                // def/use
    bool is_string_char_at;  // compressed string read
  };

  // This structure describes the control flow (CF) -> data flow (DF) conversion of the loop
  // with control flow (see below) for the purpose of predicated autovectorization.
  //
  // Lets define "loops without control-flow" (or non-CF loops) as loops with two consecutive
  // blocks and without the branching structure except for the loop exit. And
  // "loop with control-flow" (or CF-loops) - all other loops.
  //
  // In the execution of the original CF-loop on each iteration some basic block Y will be
  // either executed or not executed, depending on the control flow of the loop. More
  // specifically, a block will be executed if all the conditional branches of the nodes in
  // the control dependency graph for that block Y are taken according to the path from the loop
  // header to that basic block.
  //
  // This is the key idea of CF->DF conversion: a boolean value
  // 'ctrl_pred == cond1 && cond2 && ...' will determine whether the basic block Y will be
  // executed, where cond_K is whether the branch of the node K in the control dependency
  // graph upward traversal was taken in the 'right' direction.
  //
  // Def.: BB Y is control dependent on BB X iff
  //   (1) there exists a directed path P from X to Y with any basic block Z in P (excluding X
  //       and Y) post-dominated by Y and
  //   (2) X is not post-dominated by Y.
  //             ...
  //              X
  //     false /     \ true
  //          /       \
  //                  ...
  //                   |
  //                   Y
  //                  ...
  //
  // When doing predicated autovectorization of a CF loop, we use the CF->DF conversion approach:
  //  1) do the data analysis and vector operation creation as if it was a non-CF loop.
  //  2) for each HIf block create two vector predicate setting instructions - for True and False
  //     edges/paths.
  //  3) assign a governing vector predicate (see comments near HVecPredSetOperation)
  //     to each vector operation Alpha in the loop (including to those vector predicate setting
  //     instructions created in #2); do this by:
  //     - finding the immediate control dependent block of the instruction Alpha's block.
  //     - choosing the True or False predicate setting instruction (created in #2) depending
  //       on the path to the instruction.
  //
  // For more information check the papers:
  //
  //   - Allen, John R and Kennedy, Ken and Porterfield, Carrie and Warren, Joe,
  //     “Conversion of Control Dependence to Data Dependence,” in Proceedings of the 10th ACM
  //     SIGACT-SIGPLAN Symposium on Principles of Programming Languages, 1983, pp. 177–189.
  //   - JEANNE FERRANTE, KARL J. OTTENSTEIN, JOE D. WARREN,
  //     "The Program Dependence Graph and Its Use in Optimization"
  //
  class BlockPredicateInfo : public ArenaObject<kArenaAllocLoopOptimization> {
   public:
    BlockPredicateInfo() :
        control_predicate_(nullptr),
        true_predicate_(nullptr),
        false_predicate_(nullptr) {}

    void SetControlFlowInfo(HVecPredSetOperation* true_predicate,
                            HVecPredSetOperation* false_predicate) {
      DCHECK(!HasControlFlowOps());
      true_predicate_ = true_predicate;
      false_predicate_ = false_predicate;
    }

    bool HasControlFlowOps() const {
      // Note: a block must have both T/F predicates set or none of them.
      DCHECK_EQ(true_predicate_ == nullptr, false_predicate_ == nullptr);
      return true_predicate_ != nullptr;
    }

    HVecPredSetOperation* GetControlPredicate() const { return control_predicate_; }
    void SetControlPredicate(HVecPredSetOperation* control_predicate) {
      control_predicate_ = control_predicate;
    }

    HVecPredSetOperation* GetTruePredicate() const { return true_predicate_; }
    HVecPredSetOperation* GetFalsePredicate() const { return false_predicate_; }

   private:
    // Vector control predicate operation, associated with the block which will determine
    // the active lanes for all vector operations, originated from this block.
    HVecPredSetOperation* control_predicate_;

    // Vector predicate instruction, associated with the true sucessor of the block.
    HVecPredSetOperation* true_predicate_;
    // Vector predicate instruction, associated with the false sucessor of the block.
    HVecPredSetOperation* false_predicate_;
  };

  //
  // Loop setup and traversal.
  //

  bool LocalRun();
  void AddLoop(HLoopInformation* loop_info);
  void RemoveLoop(LoopNode* node);

  // Traverses all loops inner to outer to perform simplifications and optimizations.
  // Returns true if loops nested inside current loop (node) have changed.
  bool TraverseLoopsInnerToOuter(LoopNode* node);

  // Calculates `node`'s `try_catch_kind` and sets it to:
  // 1) kHasTryCatch if it has try catches (or if it's inside of an outer try catch)
  // 2) kNoTryCatch otherwise.
  void CalculateAndSetTryCatchKind(LoopNode* node);

  //
  // Optimization.
  //

  void SimplifyInduction(LoopNode* node);
  void SimplifyBlocks(LoopNode* node);

  // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
  // unrolling, vectorization). Returns true if anything changed.
  bool TryOptimizeInnerLoopFinite(LoopNode* node);

  // Performs optimizations specific to inner loop. Returns true if anything changed.
  bool OptimizeInnerLoop(LoopNode* node);

  // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
  // opportunities. Returns whether transformation happened. 'generate_code' determines whether the
  // optimization should be actually applied.
  bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
                                             bool generate_code = true);

  // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
  // transformation happened. 'generate_code' determines whether the optimization should be
  // actually applied.
  bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                  bool generate_code = true);

  // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
  // the loop check overhead and to have more opportunities for inter-iteration optimizations.
  // Returns whether transformation happened. 'generate_code' determines whether the optimization
  // should be actually applied.
  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);

  // Tries to remove SuspendCheck for plain loops with a low trip count. The
  // SuspendCheck in the codegen makes sure that the thread can be interrupted
  // during execution for GC. Not being able to do so might decrease the
  // responsiveness of GC when a very long loop or a long recursion is being
  // executed. However, for plain loops with a small trip count, the removal of
  // SuspendCheck should not affect the GC's responsiveness by a large margin.
  // Consequently, since the thread won't be interrupted for plain loops, it is
  // assumed that the performance might increase by removing SuspendCheck.
  bool TryToRemoveSuspendCheckFromLoopHeader(LoopAnalysisInfo* analysis_info,
                                             bool generate_code = true);

  // Tries to apply scalar loop optimizations.
  bool TryLoopScalarOpts(LoopNode* node);

  //
  // Vectorization analysis and synthesis.
  //

  // Returns whether the data flow requirements are met for vectorization.
  //
  //   - checks whether instructions are vectorizable for the target.
  //   - conducts data dependence analysis for array references.
  //   - additionally, collects info on peeling and aligment strategy.
  bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* header, bool collect_alignment_info);

  // Does the checks (common for predicated and traditional mode) for the loop.
  bool ShouldVectorizeCommon(LoopNode* node, HPhi* main_phi, int64_t trip_count);

  // Try to vectorize the loop, returns whether it was successful.
  //
  // There are two versions/algorithms:
  //  - Predicated: all the vector operations have governing predicates which control
  //    which individual vector lanes will be active (see HVecPredSetOperation for more details).
  //    Example: vectorization using AArch64 SVE.
  //  - Traditional: a regular mode in which all vector operations lanes are unconditionally
  //    active.
  //    Example: vectoriation using AArch64 NEON.
  bool TryVectorizePredicated(LoopNode* node,
                              HBasicBlock* body,
                              HBasicBlock* exit,
                              HPhi* main_phi,
                              int64_t trip_count);

  bool TryVectorizedTraditional(LoopNode* node,
                                HBasicBlock* body,
                                HBasicBlock* exit,
                                HPhi* main_phi,
                                int64_t trip_count);

  // Vectorizes the loop for which all checks have been already done.
  void VectorizePredicated(LoopNode* node,
                           HBasicBlock* block,
                           HBasicBlock* exit);
  void VectorizeTraditional(LoopNode* node,
                            HBasicBlock* block,
                            HBasicBlock* exit,
                            int64_t trip_count);

  // Performs final steps for whole vectorization process: links reduction, removes the original
  // scalar loop, updates loop info.
  void FinalizeVectorization(LoopNode* node);

  // Helpers that do the vector instruction synthesis for the previously created loop; create
  // and fill the loop body with instructions.
  //
  // A version to generate a vector loop in predicated mode.
  void GenerateNewLoopPredicated(LoopNode* node,
                                 HBasicBlock* new_preheader,
                                 HInstruction* lo,
                                 HInstruction* hi,
                                 HInstruction* step);

  // A version to generate a vector loop in traditional mode or to generate
  // a scalar loop for both modes.
  void GenerateNewLoopScalarOrTraditional(LoopNode* node,
                                          HBasicBlock* new_preheader,
                                          HInstruction* lo,
                                          HInstruction* hi,
                                          HInstruction* step,
                                          uint32_t unroll);

  //
  // Helpers for GenerateNewLoop*.
  //

  // Updates vectorization bookkeeping date for the new loop, creates and returns
  // its main induction Phi.
  HPhi* InitializeForNewLoop(HBasicBlock* new_preheader, HInstruction* lo);

  // Finalizes reduction and induction phis' inputs for the newly created loop.
  void FinalizePhisForNewLoop(HPhi* phi, HInstruction* lo);

  // Creates empty predicate info object for each basic block and puts it into the map.
  void PreparePredicateInfoMap(LoopNode* node);

  // Set up block true/false predicates using info, collected through data flow and control
  // dependency analysis.
  void InitPredicateInfoMap(LoopNode* node, HVecPredSetOperation* loop_main_pred);

  // Performs instruction synthesis for the loop body.
  void GenerateNewLoopBodyOnce(LoopNode* node,
                               DataType::Type induc_type,
                               HInstruction* step);

  // Returns whether the vector loop needs runtime disambiguation test for array refs.
  bool NeedsArrayRefsDisambiguationTest() const { return vector_runtime_test_a_ != nullptr; }

  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
  bool VectorizeUse(LoopNode* node,
                    HInstruction* instruction,
                    bool generate_code,
                    DataType::Type type,
                    uint64_t restrictions);
  uint32_t GetVectorSizeInBytes();
  bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
  bool TrySetVectorLengthImpl(uint32_t length);

  bool TrySetVectorLength(DataType::Type type, uint32_t length) {
    bool res = TrySetVectorLengthImpl(length);
    // Currently the vectorizer supports only the mode when full SIMD registers are used.
    DCHECK_IMPLIES(res, DataType::Size(type) * length == GetVectorSizeInBytes());
    return res;
  }

  void GenerateVecInv(HInstruction* org, DataType::Type type);
  void GenerateVecSub(HInstruction* org, HInstruction* offset);
  void GenerateVecMem(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
                      HInstruction* offset,
                      DataType::Type type);
  void GenerateVecReductionPhi(HPhi* phi);
  void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
  HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
  HInstruction* GenerateVecOp(HInstruction* org,
                              HInstruction* opa,
                              HInstruction* opb,
                              DataType::Type type);

  // Vectorization idioms.
  bool VectorizeSaturationIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeHalvingAddIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeSADIdiom(LoopNode* node,
                         HInstruction* instruction,
                         bool generate_code,
                         DataType::Type type,
                         uint64_t restrictions);
  bool VectorizeDotProdIdiom(LoopNode* node,
                             HInstruction* instruction,
                             bool generate_code,
                             DataType::Type type,
                             uint64_t restrictions);
  bool VectorizeIfCondition(LoopNode* node,
                            HInstruction* instruction,
                            bool generate_code,
                            uint64_t restrictions);

  // Vectorization heuristics.
  Alignment ComputeAlignment(HInstruction* offset,
                             DataType::Type type,
                             bool is_string_char_at,
                             uint32_t peeling = 0);
  void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                            const ArrayReference* peeling_candidate);
  uint32_t MaxNumberPeeled();
  bool IsVectorizationProfitable(int64_t trip_count);

  //
  // Helpers.
  //

  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
  bool TrySetPhiReduction(HPhi* phi);

  // Detects loop header with a single induction (returned in main_phi), possibly
  // other phis for reductions, but no other side effects. Returns true on success.
  bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);

  bool IsEmptyBody(HBasicBlock* block);
  bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                           HInstruction* instruction,
                           bool collect_loop_uses,
                           /*out*/ uint32_t* use_count);
  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
                         HInstruction* instruction);
  bool TryReplaceWithLastValue(HLoopInformation* loop_info,
                               HInstruction* instruction,
                               HBasicBlock* block);
  bool TryAssignLastValue(HLoopInformation* loop_info,
                          HInstruction* instruction,
                          HBasicBlock* block,
                          bool collect_loop_uses);
  void RemoveDeadInstructions(const HInstructionList& list);
  bool CanRemoveCycle();  // Whether the current 'iset_' is removable.

  bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
  void MaybeInsertInVectorExternalSet(HInstruction* instruction);

  // Compiler options (to query ISA features).
  const CompilerOptions* compiler_options_;

  // Cached target SIMD vector register size in bytes.
  const size_t simd_register_size_;

  // Range information based on prior induction variable analysis.
  InductionVarRange induction_range_;

  // Phase-local heap memory allocator for the loop optimizer. Storage obtained
  // through this allocator is immediately released when the loop optimizer is done.
  ScopedArenaAllocator* loop_allocator_;

  // Global heap memory allocator. Used to build HIR.
  ArenaAllocator* global_allocator_;

  // Entries into the loop hierarchy representation. The hierarchy resides
  // in phase-local heap memory.
  LoopNode* top_loop_;
  LoopNode* last_loop_;

  // Temporary bookkeeping of a set of instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<HInstruction*>* iset_;

  // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
  // (1) reductions in the loop-body are mapped back to their phi definition,
  // (2) phi definitions are mapped to their initial value (updated during
  //     code generation to feed the proper values into the new chain).
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;

  // Flag that tracks if any simplifications have occurred.
  bool simplified_;

  // Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
  bool predicated_vectorization_mode_;

  // Number of "lanes" for selected packed type.
  uint32_t vector_length_;

  // Set of array references in the vector loop.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<ArrayReference>* vector_refs_;

  // Static or dynamic loop peeling for alignment.
  uint32_t vector_static_peeling_factor_;
  const ArrayReference* vector_dynamic_peeling_candidate_;

  // Dynamic data dependence test of the form a != b.
  HInstruction* vector_runtime_test_a_;
  HInstruction* vector_runtime_test_b_;

  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
  // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
  // structure maps original instructions into the new instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;

  // Permanent mapping used during vectorization synthesis.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;

  // Tracks vector operations that are inserted outside of the loop (preheader, exit)
  // as part of vectorization (e.g. replicate scalar for loop invariants and reduce ops
  // for loop reductions).
  //
  // The instructions in the set are live for the whole vectorization process of the current
  // loop, not just during generation of a particular loop version (as the sets above).
  //
  // Currently the set is being only used in the predicated mode - for assigning governing
  // predicates.
  ScopedArenaSet<HInstruction*>* vector_external_set_;

  // A mapping between a basic block of the original loop and its associated PredicateInfo.
  //
  // Only used in predicated loop vectorization mode.
  ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*>* predicate_info_map_;

  // Temporary vectorization bookkeeping.
  LoopSynthesisMode synthesis_mode_;  // synthesis mode
  HBasicBlock* vector_preheader_;  // preheader of the new loop
  HBasicBlock* vector_header_;  // header of the new loop
  HBasicBlock* vector_body_;  // body of the new loop
  HInstruction* vector_index_;  // normalized index of the new loop

  // Helper for target-specific behaviour for loop optimizations.
  ArchNoOptsLoopHelper* arch_loop_helper_;

  friend class LoopOptimizationTest;
  friend class PredicatedSimdLoopOptimizationTest;

  DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
};

}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
