www.llvm.org/doxygen/LowerMatrixIntrinsics_8cpp_source.html

//===- LowerMatrixIntrinsics.cpp -  Lower matrix intrinsics -----*- C++ -*-===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// Lower matrix intrinsics to vector operations.

//

// TODO:

//  * Improve fusion:

//   * Support more cases, e.g. multiply-add, multiply-sub, operands/results

//     transposed.

//   * Improve cost-modeling, e.g. choose different number of rows/columns

//     for tiles, consider cost of copies on alias.

//

//===----------------------------------------------------------------------===//


#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"

#include "llvm/ADT/PostOrderIterator.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/ADT/ScopeExit.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/Analysis/DomTreeUpdater.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/OptimizationRemarkEmitter.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/Analysis/VectorUtils.h"

#include "llvm/IR/CFG.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DebugInfoMetadata.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/InstrTypes.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/MatrixBuilder.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/IR/ProfDataUtils.h"

#include "llvm/Support/Alignment.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Compiler.h"

#include "llvm/Support/Debug.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvm/Transforms/Utils/LoopUtils.h"

#include "llvm/Transforms/Utils/MatrixUtils.h"


#include <cmath>


using namespace llvm;

using namespace PatternMatch;


#define DEBUG_TYPE "lower-matrix-intrinsics"


// Counters for the pass; each one is bumped where the corresponding
// flatten/reshape/split happens.
STATISTIC(FlattenedMatrices, "Number of matrix flattenings");
STATISTIC(ReshapedMatrices, "Number of matrix reshapes");
STATISTIC(SplitMatrices, "Number of matrix splits");

// -fuse-matrix: enables fusing of matrix instructions. On by default.
static cl::opt<bool>
    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
               cl::desc("Enable/disable fusing matrix instructions."));
// TODO: Allow and use non-square tiles.
// -fuse-matrix-tile-size: edge length of the square tiles used for fusion.
// Defaults to 4.
static cl::opt<unsigned> TileSize(
    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
    cl::desc(
        "Tile size for matrix instruction fusion using square-shaped tiles."));
// -fuse-matrix-loops-threshold: operation count above which tiling is emitted
// as loop nests. Defaults to 200.
static cl::opt<unsigned>
    TileLoopsThreshold("fuse-matrix-loops-threshold", cl::init(200), cl::Hidden,
                       cl::desc("Generate loop nests for tiling when expected "
                                "number of operations exceeds threshold."));
// -force-fuse-matrix: fuse even when fusion is not considered profitable.
// Off by default.
static cl::opt<bool> ForceFusion(
    "force-fuse-matrix", cl::init(false), cl::Hidden,
    cl::desc("Force matrix instruction fusion even if not profitable."));
// -matrix-allow-contract: when set, getFastMathFlags() forces the
// allow-contract flag on the flags it returns. Off by default.
static cl::opt<bool> AllowContractEnabled(
    "matrix-allow-contract", cl::init(false), cl::Hidden,
    cl::desc("Allow the use of FMAs if available and profitable. This may "
             "result in different results, due to less rounding error."));

// -verify-matrix-shapes: when set, setShapeInfo() aborts compilation if a
// value is assigned a shape that conflicts with the one already recorded.
// Off by default.
static cl::opt<bool>
    VerifyShapeInfo("verify-matrix-shapes", cl::Hidden,
                    cl::desc("Enable/disable matrix shape verification."),
                    cl::init(false));

// Memory layout assumed for matrices embedded in flat vectors.
enum class MatrixLayoutTy { ColumnMajor, RowMajor };

// -matrix-default-layout: layout picked up by ShapeInfo and MatrixTy at
// construction time. Defaults to column-major.
static cl::opt<MatrixLayoutTy> MatrixLayout(
    "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
    cl::desc("Sets the default matrix layout"),
    cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
                          "Use column-major layout"),
               clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
                          "Use row-major layout")));

// -matrix-print-after-transpose-opt: off by default.
// NOTE(review): presumably dumps the function after the transpose
// optimizations; the use site is outside this part of the file. Unlike the
// other options it carries neither cl::Hidden nor a cl::desc.
static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
                                            cl::init(false));

// -matrix-split-matmul-remainder-over-threshold: bit-size threshold for
// splitting illegal remainder vectors in the matmul inner loop. Defaults to 0.
static cl::opt<unsigned> SplitMatmulRemainderOverThreshold(
    "matrix-split-matmul-remainder-over-threshold", cl::Hidden,
    cl::desc("Illegal remainder vectors over this size in bits should be split "
             "in the inner loop of matmul"),
    cl::init(0));

// Option declared here but defined in another translation unit; its use is
// not visible in this part of the file.
namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // end namespace llvm


/// Resolve \p Scope to a subprogram: \p Scope itself when it already is a
/// DISubprogram, otherwise the subprogram reported by its local scope.
static DISubprogram *getSubprogram(DIScope *Scope) {
  auto *Subprogram = dyn_cast<DISubprogram>(Scope);
  // Anything that is not a subprogram is required to be a local scope; the
  // checked cast enforces that.
  return Subprogram ? Subprogram
                    : cast<DILocalScope>(Scope)->getSubprogram();
}


/// Return true if \p V is a shufflevector that splats element zero (the form
/// used when multiplying a matrix with a scalar). Any other value yields
/// false.
static bool isSplat(Value *V) {
  auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
  return Shuffle && Shuffle->isZeroEltSplat();
}


/// Build a matcher that accepts either an integer mul or a floating-point
/// fmul whose operands match \p L and \p R.
template <typename LTy, typename RTy>
static auto m_AnyMul(const LTy &L, const RTy &R) {
  auto IntMul = m_Mul(L, R);
  auto FPMul = m_FMul(L, R);
  return m_CombineOr(IntMul, FPMul);
}


/// Build a matcher that accepts either an integer add or a floating-point
/// fadd whose operands match \p L and \p R.
template <typename LTy, typename RTy>
static auto m_AnyAdd(const LTy &L, const RTy &R) {
  auto IntAdd = m_Add(L, R);
  auto FPAdd = m_FAdd(L, R);
  return m_CombineOr(IntAdd, FPAdd);
}


// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute

// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)

// assuming \p Stride elements between the starts of two consecutive vectors.

// \p Stride must be >= \p NumElements.

// For column-major matrixes, the function computes the address of a column

// vectors and \p NumElements must be set to the number of elements in a column

// (= number of rows of the matrix). For row-major matrixes, the function

// computes the address of a row vector and \p NumElements must be set to the

// number of elements in a row (= number of columns of the matrix).

//

// Consider a 4x4 matrix in column-major layout like below

//

//      0       1      2      3

// 0   v_0_0  v_0_1  v_0_2  v_0_3

// 1   v_1_0  v_1_1  v_1_2  v_1_3

// 2   v_2_0  v_2_1  v_2_2  v_2_3

// 3   v_3_0  v_3_1  v_3_2  v_3_3


// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,

// we need a pointer to the first element of the submatrix as base pointer.

// Then we can use computeVectorAddr to compute the addresses for the columns

// of the sub-matrix.

//

// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)

//           -> just returns Base

// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (1 * 4)

// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)

//           -> returns Base + (2 * 4)

//

// The graphic below illustrates the number of elements in a column (marked

// with |) and the number of skipped elements (marked with }).

//

//         v_0_0  v_0_1 {v_0_2 {v_0_3

//                Base   Col 1  Col 2

//                  |     |      |

//         v_1_0 |v_1_1 |v_1_2 |v_1_3

//         v_2_0 |v_2_1 |v_2_2 |v_2_3

//         v_3_0 {v_3_1 {v_3_2  v_3_3

//


static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
                                unsigned NumElements, Type *EltType,
                                IRBuilder<> &Builder) {
  // Only a constant stride can be checked here; a dynamic stride is trusted.
  assert((!isa<ConstantInt>(Stride) ||
          cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
         "Stride must be >= the number of elements in the result vector.");

  // Element offset of the requested vector relative to BasePtr:
  // VecIdx * Stride.
  Value *Offset = Builder.CreateMul(VecIdx, Stride, "vec.start");

  // Vector 0 starts at the base pointer itself, so no GEP is needed when the
  // offset is a constant zero.
  const bool IsZeroOffset =
      isa<ConstantInt>(Offset) && cast<ConstantInt>(Offset)->isZero();
  if (IsZeroOffset)
    return BasePtr;

  return Builder.CreateInBoundsGEP(EltType, BasePtr, Offset, "vec.gep");
}


namespace {

/// Dimensions of a matrix value plus the layout that was selected through
/// MatrixLayout when the object was constructed. A default-constructed
/// ShapeInfo (0 x 0) stands for "no shape known" and converts to false.
struct ShapeInfo {
  unsigned NumRows;
  unsigned NumColumns;

  /// Snapshot of the MatrixLayout option taken in the constructor.
  bool IsColumnMajor;

  ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
      : NumRows(NumRows), NumColumns(NumColumns),
        IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

  /// Build a shape from two values that must be ConstantInts (enforced by
  /// the checked casts).
  ShapeInfo(Value *NumRows, Value *NumColumns)
      : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
                  cast<ConstantInt>(NumColumns)->getZExtValue()) {}

  /// Two shapes are equal when both dimensions agree; the layout flag is not
  /// part of the comparison. The comparison operators are const so that they
  /// can be applied to const objects as well.
  bool operator==(const ShapeInfo &other) const {
    return NumRows == other.NumRows && NumColumns == other.NumColumns;
  }
  bool operator!=(const ShapeInfo &other) const { return !(*this == other); }

  /// Returns true if shape-information is defined, meaning both dimensions
  /// are != 0.
  operator bool() const {
    // A non-zero row count with a zero column count is not a valid state.
    assert(NumRows == 0 || NumColumns != 0);
    return NumRows != 0;
  }

  /// Number of elements in one stored vector: rows for column-major,
  /// columns for row-major.
  unsigned getStride() const {
    if (IsColumnMajor)
      return NumRows;
    return NumColumns;
  }

  /// Number of stored vectors: columns for column-major, rows for row-major.
  unsigned getNumVectors() const {
    if (IsColumnMajor)
      return NumColumns;
    return NumRows;
  }

  /// Returns the transposed shape.
  ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); }

  friend raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI);

  LLVM_DUMP_METHOD void dump() const { dbgs() << *this << '\n'; }
};


/// Print \p SI to \p OS in the form "<rows>x<columns>".
raw_ostream &operator<<(raw_ostream &OS, ShapeInfo SI) {
  OS << SI.NumRows << 'x' << SI.NumColumns;
  return OS;
}


} // namespace


/// Return true if \p V is treated as shape-preserving: values that are not
/// instructions, selects, binary operators, the value-converting casts listed
/// below, bitcasts between fixed vectors with equal element counts, the
/// abs/fabs intrinsics, PHIs and fneg. Everything else returns false.
static bool isShapePreserving(Value *V) {
  auto *Inst = dyn_cast<Instruction>(V);
  if (!Inst || isa<SelectInst>(Inst) || Inst->isBinaryOp())
    return true;

  if (auto *Cast = dyn_cast<CastInst>(Inst)) {
    // Every cast opcode is listed explicitly; there is deliberately no
    // default label.
    switch (Cast->getOpcode()) {
    case Instruction::Trunc:
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::UIToFP:
    case Instruction::SIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
      return true;
    case Instruction::AddrSpaceCast:
    case Instruction::PtrToAddr:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
      return false;
    case Instruction::BitCast: {
      // Only a fixed-vector to fixed-vector bitcast that keeps the element
      // count qualifies.
      auto *SrcVTy = dyn_cast<FixedVectorType>(Cast->getSrcTy());
      auto *DestVTy = dyn_cast<FixedVectorType>(Cast->getDestTy());
      return SrcVTy && DestVTy &&
             SrcVTy->getNumElements() == DestVTy->getNumElements();
    }
    case Instruction::CastOpsEnd:
      llvm_unreachable("not an actual cast op");
    }
    llvm_unreachable("unhandled cast opcode");
  }

  if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
    const auto ID = II->getIntrinsicID();
    return ID == Intrinsic::abs || ID == Intrinsic::fabs;
  }

  const unsigned Opcode = Inst->getOpcode();
  return Opcode == Instruction::PHI || Opcode == Instruction::FNeg;
}


/// Return the range of operands of \p I that share their shape with \p I.
/// That is every operand, except for selects, where the first operand (the
/// condition) is skipped. \p I must be shape-preserving.
static iterator_range<Use *> getShapedOperandsForInst(Instruction *I) {
  assert(isShapePreserving(I) &&
         "Can't retrieve shaped operands for an instruction that does not "
         "preserve shape information");
  if (isa<SelectInst>(I))
    return drop_begin(I->operands());
  return I->operands();
}


/// Return the ShapeInfo for the result of \p I, if it can be determined.
///
/// Matrix intrinsics supply their shape through their constant dimension
/// arguments. A plain store takes the shape recorded for the stored value, and
/// a shape-preserving instruction takes the shape of the first shaped operand
/// found in \p ShapeMap. std::nullopt is returned when none of these apply.
static std::optional<ShapeInfo>
computeShapeInfoForInst(Instruction *I,
                        const DenseMap<Value *, ShapeInfo> &ShapeMap) {
  Value *M;
  Value *N;
  Value *K;

  // matrix_multiply(A, B, M, N, K) produces an M x K result.
  if (match(I, m_Intrinsic<Intrinsic::matrix_multiply>(
                   m_Value(), m_Value(), m_Value(M), m_Value(N), m_Value(K))))
    return ShapeInfo(M, K);

  // The transpose result has the dimension arguments flipped.
  if (match(I, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(), m_Value(M),
                                                        m_Value(N))))
    return ShapeInfo(N, M);

  // NOTE(review): the dimension arguments are recorded in swapped order for
  // the store intrinsic, while propagateShapeBackward assigns {M, N} to the
  // stored operand - confirm this is intentional.
  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                   m_Value(), m_Value(), m_Value(), m_Value(), m_Value(M),
                   m_Value(N))))
    return ShapeInfo(N, M);

  if (match(I, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                   m_Value(), m_Value(), m_Value(), m_Value(M), m_Value(N))))
    return ShapeInfo(M, N);

  // Shape already recorded for a value, if any.
  auto KnownShape = [&ShapeMap](Value *V) -> std::optional<ShapeInfo> {
    auto Entry = ShapeMap.find(V);
    if (Entry == ShapeMap.end())
      return std::nullopt;
    return Entry->second;
  };

  Value *MatrixA;
  if (match(I, m_Store(m_Value(MatrixA), m_Value())))
    if (auto Shape = KnownShape(MatrixA))
      return Shape;

  // Use the first shaped operand that has a known shape.
  if (isShapePreserving(I))
    for (auto &Op : getShapedOperandsForInst(I))
      if (auto Shape = KnownShape(Op.get()))
        return Shape;

  return std::nullopt;
}


namespace {


/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.

///

/// Currently, the lowering for each matrix intrinsic is done as follows:

/// 1. Propagate the shape information from intrinsics to connected

/// instructions.

/// 2. Lower instructions with shape information (assuming column-major layout).

///  The lowering works similarly using row-major layout.

///  2.1. Get column vectors for each argument. If we already lowered the

///       definition of an argument, use the produced column vectors directly.

///       If not, split the operand vector containing an embedded matrix into

///       a set of column vectors,

///  2.2. Lower the instruction in terms of column major operations, which

///       yields a set of column vectors containing result matrix. Note that we

///       lower all instructions that have shape information. Besides the

///       intrinsics, this includes stores for example.

///  2.3. Update uses of the lowered instruction. If we have shape information

///       for a user, there is nothing to do, as we will look up the result

///       column matrix when lowering the user. For other uses, we embed the

///       result matrix in a flat vector and update the use.

///  2.4. Cache the result column matrix for the instruction we lowered

/// 3. After we lowered all instructions in a function, remove the now

///    obsolete instructions.

///

class LowerMatrixIntrinsics {

  /// Function being processed.
  Function &Func;
  /// Data layout, taken from the function in the constructor.
  const DataLayout &DL;
  /// Target information; queried e.g. for the fixed-width vector register
  /// size in getNumOps.
  const TargetTransformInfo &TTI;
  /// Analysis manager handed to the constructor.
  FunctionAnalysisManager *AM;
  /// Optional analyses, null after construction. isMinimal() treats a null DT
  /// as the minimal configuration. NOTE(review): the code that populates
  /// these is outside this part of the file.
  AliasAnalysis *AA = nullptr;
  DominatorTree *DT = nullptr;
  LoopInfo *LI = nullptr;
  OptimizationRemarkEmitter *ORE = nullptr;


  /// Estimated operation counts (loads, stores, compute, exposed transposes)
  /// needed to lower a matrix operation. All counters start at zero.
  struct OpInfoTy {
    /// Stores emitted to produce this matrix.
    unsigned NumStores = 0;
    /// Loads emitted to produce this matrix.
    unsigned NumLoads = 0;
    /// Compute operations emitted to produce this matrix.
    unsigned NumComputeOps = 0;
    /// Transposes are usually fused into multiplies or folded away by
    /// algebraic simplification; this counts the ones that could not be made
    /// "free" that way.
    unsigned NumExposedTransposes = 0;

    /// Add every counter of \p Other onto the matching counter of this
    /// object and return this object.
    OpInfoTy &operator+=(const OpInfoTy &Other) {
      NumExposedTransposes += Other.NumExposedTransposes;
      NumComputeOps += Other.NumComputeOps;
      NumLoads += Other.NumLoads;
      NumStores += Other.NumStores;
      return *this;
    }
  };


  /// Wrapper class representing a matrix as a set of vectors, either in row or
  /// column major layout. All vectors must have the same vector type.
  ///
  /// In column-major layout every stored vector is one column; in row-major
  /// layout every stored vector is one row. The layout is sampled from the
  /// MatrixLayout option when the object is constructed.
  class MatrixTy {
    SmallVector<Value *, 16> Vectors;

    OpInfoTy OpInfo;

    bool IsColumnMajor = true;

  public:
    /// Create an empty matrix (no vectors).
    MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    /// Create a matrix from already existing row/column vectors.
    MatrixTy(ArrayRef<Value *> Vectors)
        : Vectors(Vectors),
          IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}

    /// Create a \p NumRows x \p NumColumns matrix whose vectors are all
    /// poison values of element type \p EltTy.
    MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
        : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {

      unsigned D = isColumnMajor() ? NumColumns : NumRows;
      for (unsigned J = 0; J < D; ++J)
        addVector(PoisonValue::get(FixedVectorType::get(
            EltTy, isColumnMajor() ? NumRows : NumColumns)));
    }

    Value *getVector(unsigned i) const { return Vectors[i]; }
    Value *getColumn(unsigned i) const {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return Vectors[i];
    }
    Value *getRow(unsigned i) const {
      assert(!isColumnMajor() && "only supported for row-major matrixes");
      return Vectors[i];
    }

    void setVector(unsigned i, Value *V) { Vectors[i] = V; }

    /// Element type of the stored vectors. Requires at least one vector.
    Type *getElementType() const { return getVectorTy()->getElementType(); }

    unsigned getNumVectors() const {
      if (isColumnMajor())
        return getNumColumns();
      return getNumRows();
    }

    /// Number of columns: the vector count for column-major matrixes, the
    /// element count of a row vector otherwise.
    unsigned getNumColumns() const {
      if (isColumnMajor())
        return Vectors.size();
      else {
        // The message used to be a copy of the getNumRows() one.
        assert(Vectors.size() > 0 && "Cannot call getNumColumns without rows");
        return getVectorTy()->getNumElements();
      }
    }

    /// Number of rows: the element count of a column vector for column-major
    /// matrixes, the vector count otherwise.
    unsigned getNumRows() const {
      if (isColumnMajor()) {
        assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
        return getVectorTy()->getNumElements();
      } else
        return Vectors.size();
    }

    void addVector(Value *V) { Vectors.push_back(V); }

    /// Type of a column vector. Const-qualified because it only reads state.
    FixedVectorType *getColumnTy() const {
      assert(isColumnMajor() && "only supported for column-major matrixes");
      return getVectorTy();
    }

    /// Type of the stored vectors, taken from the first one. Requires at
    /// least one vector.
    FixedVectorType *getVectorTy() const {
      return cast<FixedVectorType>(Vectors[0]->getType());
    }

    /// Iterate over the columns; the inline size in the iterator type now
    /// matches the one used for the Vectors member.
    iterator_range<SmallVector<Value *, 16>::iterator> columns() {
      assert(isColumnMajor() &&
             "columns() only supported for column-major matrixes");
      return make_range(Vectors.begin(), Vectors.end());
    }

    /// Iterate over the stored vectors regardless of layout.
    iterator_range<SmallVector<Value *, 16>::iterator> vectors() {
      return make_range(Vectors.begin(), Vectors.end());
    }

    /// Embed the vectors of the matrix into a flat vector by concatenating
    /// them. A matrix consisting of a single vector is returned as is.
    Value *embedInVector(IRBuilder<> &Builder) const {
      return Vectors.size() == 1 ? Vectors[0]
                                 : concatenateVectors(Builder, Vectors);
    }

    MatrixTy &addNumLoads(unsigned N) {
      OpInfo.NumLoads += N;
      return *this;
    }

    void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }

    MatrixTy &addNumStores(unsigned N) {
      OpInfo.NumStores += N;
      return *this;
    }

    MatrixTy &addNumExposedTransposes(unsigned N) {
      OpInfo.NumExposedTransposes += N;
      return *this;
    }

    MatrixTy &addNumComputeOps(unsigned N) {
      OpInfo.NumComputeOps += N;
      return *this;
    }

    unsigned getNumStores() const { return OpInfo.NumStores; }
    unsigned getNumLoads() const { return OpInfo.NumLoads; }
    unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }

    const OpInfoTy &getOpInfo() const { return OpInfo; }

    bool isColumnMajor() const { return IsColumnMajor; }

    /// Number of elements in one stored vector.
    unsigned getStride() const {
      if (isColumnMajor())
        return getNumRows();
      return getNumColumns();
    }

    ShapeInfo shape() const { return {getNumRows(), getNumColumns()}; }

    /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
    /// matrix is column-major, the result vector is extracted from a column
    /// vector, otherwise from a row vector.
    Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
                         IRBuilder<> &Builder) const {
      Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
      assert(cast<FixedVectorType>(Vec->getType())->getNumElements() >=
                 NumElts &&
             "Extracted vector will contain poison values");
      // The mask starts at the row index for a column vector and at the
      // column index for a row vector.
      return Builder.CreateShuffleVector(
          Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
          "block");
    }
  };


  /// Maps instructions to their shape information. The shape information

  /// describes the shape to be used while lowering. This matches the shape of

  /// the result value of the instruction, with the only exceptions being store

  /// instructions and the matrix_column_major_store intrinsics. For those, the

  /// shape information indicates that those instructions should be lowered

  /// using shape information as well. Note that extra care is needed when

  /// erasing or RAUW'ing a value that is present in ShapeMap. If the

  /// replacement is also a matrix operation, use

  /// updateShapeAndReplaceAllUsesWith to make sure the replacement is added to

  /// ShapeMap.  We don't use ValueMap, as there are also cases where we do not

  /// want to add shape information for a replacement instruction. When directly

  /// erasing a value with an entry in ShapeMap, use

  /// eraseFromParentAndRemoveFromShapeMap to make sure ShapeMap is also updated

  /// accordingly.

  DenseMap<Value *, ShapeInfo> ShapeMap;


  /// List of instructions to remove. While lowering, we are not replacing all

  /// users of a lowered instruction, if shape information is available and

  /// those need to be removed after we finished lowering.

  SmallVector<Instruction *, 16> ToRemove;


  /// Map from instructions to their produced column matrix.

  MapVector<Value *, MatrixTy> Inst2ColumnMatrix;


private:

  /// Return the fast-math flags of \p Inst when it is a floating-point math
  /// operator, and empty flags otherwise. The allow-contract flag is forced on
  /// when the -matrix-allow-contract option is set.
  static FastMathFlags getFastMathFlags(Instruction *Inst) {
    FastMathFlags FMF = isa<FPMathOperator>(*Inst) ? Inst->getFastMathFlags()
                                                   : FastMathFlags();

    // Without the option the flag keeps whatever value the instruction had.
    if (AllowContractEnabled)
      FMF.setAllowContract(true);

    return FMF;
  }


public:

  /// Set up the lowering for function \p F. The data layout is taken from
  /// \p F; \p TTI and \p AM are stored for later use. The optional analyses
  /// (AA, DT, LI, ORE) are not touched here and keep their null defaults.
  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
                        FunctionAnalysisManager *AM)
      : Func(F), DL(F.getDataLayout()), TTI(TTI), AM(AM) {}


  /// Return the estimated number of vector ops required for an operation on
  /// the fixed vector type \p VT, by forwarding its scalar type and element
  /// count to the two-argument overload.
  unsigned getNumOps(Type *VT) {
    assert(isa<FixedVectorType>(VT) && "Expected vector type");
    auto *VecTy = cast<FixedVectorType>(VT);
    return getNumOps(VT->getScalarType(), VecTy->getNumElements());
  }


  /// Return true when this is the minimal version executed in the backend
  /// pipelines, which is detected by the dominator tree being absent.
  bool isMinimal() const { return DT == nullptr; }


  /// Return the estimated number of vector ops required for an operation on
  /// \p N elements of scalar type \p ST: the total bit width divided by the
  /// target's fixed-width vector register width, rounded up.
  unsigned getNumOps(Type *ST, unsigned N) {
    const auto TotalBits = (ST->getPrimitiveSizeInBits() * N).getFixedValue();
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    // The division is done in floating point so that std::ceil rounds up.
    return std::ceil(TotalBits / double(RegisterBits));
  }


  /// Estimate the number of native vector operations for a multiply of matrices
  /// with dimensions \p R x \p M and \p M x \p C. The per-vector native op
  /// count comes from getNumOps(), i.e. ceil(ElementCount * ElementBits /
  /// RegisterBits).
  ///
  /// Contributions (VF = native vector elements):
  ///   FMAs:    C * ceil(R/VF) * M (one FMA per VF output elements)
  ///   A loads: ceil(R/VF) * M (A has M columns, ceil(R/VF) native loads each)
  ///   B loads: ceil(M/VF) * C (B has C columns, ceil(M/VF) native loads each)
  ///   Stores:  C * ceil(R/VF) (one store per VF output elements)
  unsigned getNumNativeVectorOps(Type *EltType, unsigned R, unsigned M,
                                 unsigned C) {
    // Native ops needed to cover one column of R elements, and one of M.
    const unsigned OpsPerRColumn = getNumOps(EltType, R);
    const unsigned OpsPerMColumn = getNumOps(EltType, M);

    unsigned Total = C * OpsPerRColumn * M; // FMAs
    Total += OpsPerRColumn * M;             // loads of A
    Total += OpsPerMColumn * C;             // loads of B
    Total += OpsPerRColumn * C;             // stores
    return Total;
  }


  /// Return the set of vectors that a matrix value is lowered to.
  ///
  /// If we lowered \p MatrixVal, just return the cache result matrix. Otherwise
  /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
  /// into vectors.
  ///
  /// \p MatrixVal must be a fixed vector with exactly SI.NumRows *
  /// SI.NumColumns elements. Shuffles are emitted through \p Builder. The
  /// ReshapedMatrices / SplitMatrices statistics are only updated when the
  /// value being split is an instruction.
  MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
                     IRBuilder<> &Builder) {
    FixedVectorType *VType = cast<FixedVectorType>(MatrixVal->getType());
    assert(VType->getNumElements() == SI.NumRows * SI.NumColumns &&
           "The vector size must match the number of matrix elements");

    // Check if we lowered MatrixVal using shape information. In that case,
    // return the existing matrix, if it matches the requested shape
    // information. If there is a mis-match, embed the result in a flat
    // vector and split it later.
    auto Found = Inst2ColumnMatrix.find(MatrixVal);
    if (Found != Inst2ColumnMatrix.end()) {
      MatrixTy &M = Found->second;
      // Return the found matrix, if its shape matches the requested shape
      // information
      if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
        return M;

      // From here on MatrixVal refers to the flattened form of the cached
      // matrix, not to the value that was passed in.
      MatrixVal = M.embedInVector(Builder);
    }

    // Otherwise split MatrixVal.
    // One shuffle per result vector, each taking SI.getStride() consecutive
    // elements.
    SmallVector<Value *, 16> SplitVecs;
    for (unsigned MaskStart = 0; MaskStart < VType->getNumElements();
         MaskStart += SI.getStride()) {
      Value *V = Builder.CreateShuffleVector(
          MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0),
          "split");
      SplitVecs.push_back(V);
    }

    // Debug output and statistics only; nothing below changes the result.
    if (Instruction *Inst = dyn_cast<Instruction>(MatrixVal)) {
      if (Found != Inst2ColumnMatrix.end()) {
        // FIXME: re: "at least": SplitVecs.size() doesn't count the shuffles
        // that embedInVector created.
        LLVM_DEBUG(dbgs() << "matrix reshape from " << Found->second.shape()
                          << " to " << SI << " using at least "
                          << SplitVecs.size() << " shuffles on behalf of:\n"
                          << *Inst << '\n');
        ReshapedMatrices++;
      } else if (!ShapeMap.contains(MatrixVal)) {
        LLVM_DEBUG(
            dbgs()
            << "splitting a " << SI << " matrix with " << SplitVecs.size()
            << " shuffles beacuse we do not have a shape-aware lowering for "
               "its def:\n"
            << *Inst << '\n');
        // Inst is otherwise unused when LLVM_DEBUG compiles to nothing.
        (void)Inst;
        SplitMatrices++;
      } else {
        // The ShapeMap has it, so it's a case where we're being lowered
        // before the def, and we expect that InstCombine will clean things up
        // afterward.
      }
    }

    return {SplitVecs};
  }


  /// Record \p Shape for \p V and return true. Returns false without
  /// recording anything when \p V is an undef value, does not support shape
  /// information, or already has a shape. With -verify-matrix-shapes, an
  /// existing shape that differs from \p Shape aborts compilation.
  bool setShapeInfo(Value *V, ShapeInfo Shape) {
    assert(Shape && "Shape not set");
    if (isa<UndefValue>(V) || !supportsShapeInfo(V))
      return false;

    const auto Existing = ShapeMap.find(V);
    if (Existing == ShapeMap.end()) {
      // First shape seen for this value.
      ShapeMap.insert({V, Shape});
      LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
                        << " for " << *V << "\n");
      return true;
    }

    // An earlier shape always wins; optionally check that both agree.
    const ShapeInfo &Known = Existing->second;
    const bool Conflicting = Known.NumRows != Shape.NumRows ||
                             Known.NumColumns != Shape.NumColumns;
    if (VerifyShapeInfo && Conflicting) {
      errs() << "Conflicting shapes (" << Known.NumRows << "x"
             << Known.NumColumns << " vs " << Shape.NumRows << "x"
             << Shape.NumColumns << ") for " << *V << "\n";
      report_fatal_error(
          "Matrix shape verification failed, compilation aborted!");
    }

    LLVM_DEBUG(dbgs() << "  not overriding existing shape: " << Known.NumRows
                      << " " << Known.NumColumns << " for " << *V << "\n");
    return false;
  }


  /// Returns true if shape information can be used for \p V. The supported
  /// instructions must match the instructions that can be lowered by this
  /// pass: the four matrix intrinsics, loads, stores and every
  /// shape-preserving instruction. Values that are not instructions are never
  /// supported.
  bool supportsShapeInfo(Value *V) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      return false;

    if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::matrix_multiply:
      case Intrinsic::matrix_transpose:
      case Intrinsic::matrix_column_major_load:
      case Intrinsic::matrix_column_major_store:
        return true;
      default:
        // Other intrinsics are decided by the generic checks below.
        break;
      }
    }

    if (isa<StoreInst>(V) || isa<LoadInst>(V))
      return true;
    return isShapePreserving(V);
  }


  /// Propagate the shape information of instructions to their users.
  /// \p WorkList holds instructions whose shape can be computed, either from
  /// a matrix intrinsic or from the known shape of an operand. It is drained
  /// by this function. The returned list contains every instruction that
  /// received a new shape here.
  SmallVector<Instruction *, 32>
  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;
    LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Instruction *Inst = WorkList.pop_back_val();

      // Skip instructions whose shape is unknown or that were not updated
      // (already shaped, or unsupported).
      auto Shape = computeShapeInfoForInst(Inst, ShapeMap);
      if (!Shape || !setShapeInfo(Inst, *Shape))
        continue;

      // Inst has a fresh shape: report it and queue its still-unshaped
      // users.
      NewWorkList.push_back(Inst);
      for (auto *U : Inst->users())
        if (!ShapeMap.contains(U))
          WorkList.push_back(cast<Instruction>(U));
    }

    return NewWorkList;
  }


  /// Propagate the shape to operands of instructions with shape information.
  /// \p Worklist contains the instruction for which we already know the shape.
  ///
  /// \p WorkList is drained by this function. The returned list holds the
  /// instruction users of every operand that received a new shape here; they
  /// serve as seeds for the next round of forward propagation.
  SmallVector<Instruction *, 32>
  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
    SmallVector<Instruction *, 32> NewWorkList;

    // Append V to the given work list if it is an instruction; other values
    // are ignored.
    auto pushInstruction = [](Value *V,
                              SmallVectorImpl<Instruction *> &WorkList) {
      Instruction *I = dyn_cast<Instruction>(V);
      if (I)
        WorkList.push_back(I);
    };
    // Pop an element with known shape.  Traverse the operands, if their shape
    // derives from the result shape and is unknown, add it and add them to the
    // worklist.
    LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
    while (!WorkList.empty()) {
      Value *V = WorkList.pop_back_val();

      // Everything pushed while handling V ends up at indices
      // [BeforeProcessingV, WorkList.size()).
      size_t BeforeProcessingV = WorkList.size();
      if (!isa<Instruction>(V))
        continue;

      Value *MatrixA;
      Value *MatrixB;
      Value *M;
      Value *N;
      Value *K;
      if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
                       m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
                       m_Value(N), m_Value(K)))) {
        // Operands of an (M x N) * (N x K) multiply.
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);

        if (setShapeInfo(MatrixB, {N, K}))
          pushInstruction(MatrixB, WorkList);

      } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
                              m_Value(MatrixA), m_Value(M), m_Value(N)))) {
        // The dimension arguments describe the operand, so it gets M x N
        // unflipped; only the transpose result has the flipped N x M shape.
        if (setShapeInfo(MatrixA, {M, N}))
          pushInstruction(MatrixA, WorkList);
      } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
                              m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
                              m_Value(M), m_Value(N)))) {
        if (setShapeInfo(MatrixA, {M, N})) {
          pushInstruction(MatrixA, WorkList);
        }
      } else if (isa<LoadInst>(V) ||
                 match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
        // Nothing to do, no matrix input.
      } else if (isa<StoreInst>(V)) {
        // Nothing to do.  We forward-propagated to this so we would just
        // backward propagate to an instruction with an already known shape.
      } else if (isShapePreserving(V)) {
        auto ShapedOps = getShapedOperandsForInst(cast<Instruction>(V));
        // Propagate to all operands.
        // NOTE(review): operator[] would default-insert an empty shape if V
        // had no entry; this assumes V always has one here - confirm.
        ShapeInfo Shape = ShapeMap[V];
        for (Use &U : ShapedOps) {
          if (setShapeInfo(U.get(), Shape))
            pushInstruction(U.get(), WorkList);
        }
      }
      // After we discovered new shape info for new instructions in the
      // worklist, we use their users as seeds for the next round of forward
      // propagation.
      for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
        for (User *U : WorkList[I]->users())
          // V itself is skipped.
          if (isa<Instruction>(U) && V != U)
            NewWorkList.push_back(cast<Instruction>(U));
    }
    return NewWorkList;
  }


  /// Push a transpose down onto the operands of a binary operation:
  /// (Op0 op Op1)^T -> Op0^T op Op1^T
  ///
  /// A transpose of \p Op0 (of shape \p Shape0) and one of \p Op1 (of shape
  /// \p Shape1) are emitted through \p Builder and registered with their
  /// transposed shapes. Both are then passed, together with those shapes, to
  /// \p Operation, whose result is returned.
  Instruction *distributeTransposes(
      Value *Op0, ShapeInfo Shape0, Value *Op1, ShapeInfo Shape1,
      MatrixBuilder &Builder,
      function_ref<Instruction *(Value *, ShapeInfo, Value *, ShapeInfo)>
          Operation) {
    // Shape propagation has already run at this point, so each instruction
    // created here gets its shape recorded by hand; this is what makes it
    // eligible for lowering later.
    Value *LHSTransposed = Builder.CreateMatrixTranspose(
        Op0, Shape0.NumRows, Shape0.NumColumns, Op0->getName() + "_t");
    setShapeInfo(LHSTransposed, Shape0.t());

    Value *RHSTransposed = Builder.CreateMatrixTranspose(
        Op1, Shape1.NumRows, Shape1.NumColumns, Op1->getName() + "_t");
    setShapeInfo(RHSTransposed, Shape1.t());

    return Operation(LHSTransposed, Shape0.t(), RHSTransposed, Shape1.t());
  }


  /// Erase \p Inst from both ShapeMap (if an entry exists) and erase \p Inst
  /// itself.
  ///
  /// No check for remaining uses is made here; callers in this file test
  /// use_empty() before calling.
  void eraseFromParentAndRemoveFromShapeMap(Instruction *Inst) {
    // Drop the map entry while Inst is still alive, then delete it.
    ShapeMap.erase(Inst);
    Inst->eraseFromParent();
  }


  /// Erase \p V, provided it has no uses left, and keep \p II valid.
  ///
  /// \p V must be an instruction. If \p II currently points at it, \p II is
  /// advanced by one before the erase so that the caller's walk over \p BB is
  /// not invalidated. Instructions that still have uses are left alone.
  void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II,
                              BasicBlock &BB) {
    Instruction *Dead = cast<Instruction>(V);
    if (Dead->use_empty()) {
      // Step off the instruction before it disappears.
      const bool IteratorOnDead = II != BB.rend() && &*II == Dead;
      if (IteratorOnDead)
        ++II;
      eraseFromParentAndRemoveFromShapeMap(Dead);
    }
  }


  /// Replace all uses of \p Old with \p New and hand \p Old's shape over.
  ///
  /// If \p Old has a ShapeMap entry, that entry is removed and, when \p New
  /// supports shape info, re-added under \p New with the same shape. \p Old
  /// itself is not erased.
  void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) {
    // Old's entry has to leave the ShapeMap before the RAUW, otherwise the
    // RAUW would replace it with New. New should only be added if it
    // supportsShapeInfo, hence the explicit conditional insert.
    if (auto Entry = ShapeMap.find(&Old); Entry != ShapeMap.end()) {
      ShapeInfo OldShape = Entry->second;
      ShapeMap.erase(Entry);
      if (supportsShapeInfo(New))
        ShapeMap.insert({New, OldShape});
    }
    Old.replaceAllUsesWith(New);
  }


  /// Sink a top-level transpose inside matmuls and adds.
  /// This creates and erases instructions as needed, and returns the newly
  /// created instruction while updating the iterator to avoid invalidation. If
  /// this returns nullptr, no new instruction was created.
  ///
  /// \param I       Candidate instruction. Anything that is not a matrix
  ///                transpose with constant dimensions is left untouched.
  /// \param II      Reverse iterator of the caller's walk over I's block; it
  ///                is advanced when the instruction it points at is erased.
  /// \param Changed Set to true whenever a rewrite happened; never cleared.
  /// \returns the instruction that replaced \p I, or nullptr. The two folds
  /// that only reuse existing values (double transpose, transposed splat)
  /// also return nullptr although they set \p Changed.
  Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II,
                             bool &Changed) {
    BasicBlock &BB = *I.getParent();
    // New instructions are inserted right before I.
    IRBuilder<> IB(&I);
    MatrixBuilder Builder(IB);

    Value *TA, *TAMA, *TAMB;
    ConstantInt *R, *K, *C;
    // TA is the transposed operand; R and C are the transpose's row and
    // column arguments.
    if (!match(&I, m_Intrinsic<Intrinsic::matrix_transpose>(
                       m_Value(TA), m_ConstantInt(R), m_ConstantInt(C))))
      return nullptr;

    // Transpose of a transpose is a nop when the shapes match.
    Value *TATA;
    if (match(TA, m_Intrinsic<Intrinsic::matrix_transpose>(
                      m_Value(TATA), m_Specific(C), m_Specific(R)))) {
      updateShapeAndReplaceAllUsesWith(I, TATA);
      // I goes first: TA can only become unused once I, its user, is gone.
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      Changed = true;
      return nullptr;
    }

    // k^T -> k
    if (isSplat(TA)) {
      updateShapeAndReplaceAllUsesWith(I, TA);
      eraseFromParentAndMove(&I, II, BB);
      Changed = true;
      return nullptr;
    }

    // (A * B)^t -> B^t * A^t
    // RxK KxC      CxK   KxR
    // Note: this match rebinds R and C to the multiply's own dimensions.
    if (match(TA, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(TAMA), m_Value(TAMB), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C)))) {
      // The operands are passed in swapped order (B first, then A).
      auto NewInst = distributeTransposes(
          TAMB, {K, C}, TAMA, {R, K}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            return Builder.CreateMatrixMultiply(T0, T1, Shape0.NumRows,
                                                Shape0.NumColumns,
                                                Shape1.NumColumns, "mmul");
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      Changed = true;
      return NewInst;
    }

    // Same as above, but with a mul, which occurs when multiplied
    // with a scalar.
    // (A * k)^t -> A^t * k
    //  R  x  C     RxC
    if (match(TA, m_AnyMul(m_Value(TAMA), m_Value(TAMB))) &&
        (isSplat(TAMA) || isSplat(TAMB))) {
      IRBuilder<> LocalBuilder(&I);
      // We know that the transposed operand is of shape RxC.
      // And when multiplied with a scalar, the shape is preserved.
      auto NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            // Pick the floating-point or integer multiply from I's type.
            bool IsFP = I.getType()->isFPOrFPVectorTy();
            auto *Mul = IsFP ? LocalBuilder.CreateFMul(T0, T1, "mmul")
                             : LocalBuilder.CreateMul(T0, T1, "mmul");
            auto *Result = cast<Instruction>(Mul);
            setShapeInfo(Result, Shape0);
            return Result;
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      Changed = true;
      return NewInst;
    }

    // (A + B)^t -> A^t + B^t
    // RxC RxC      CxR   CxR
    if (match(TA, m_AnyAdd(m_Value(TAMA), m_Value(TAMB)))) {
      IRBuilder<> LocalBuilder(&I);
      auto NewInst = distributeTransposes(
          TAMA, {R, C}, TAMB, {R, C}, Builder,
          [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) {
            // Pick the floating-point or integer add from I's type.
            bool IsFP = I.getType()->isFPOrFPVectorTy();
            auto *Add = IsFP ? LocalBuilder.CreateFAdd(T0, T1, "madd")
                             : LocalBuilder.CreateAdd(T0, T1, "madd");

            auto *Result = cast<Instruction>(Add);
            setShapeInfo(Result, Shape0);
            return Result;
          });
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      eraseFromParentAndMove(&I, II, BB);
      eraseFromParentAndMove(TA, II, BB);
      Changed = true;
      return NewInst;
    }

    return nullptr;
  }


  /// Lift the transposes out of an operation whose two operands are both
  /// transposes, so that a single transpose of the result remains:
  ///   A^t * B^t -> (B * A)^t   (matrix multiply)
  ///   A^t + B^t -> (A + B)^t   (fadd)
  ///
  /// All uses of \p I are redirected to the new transpose; \p I and its two
  /// operands are erased once they have no uses left.
  ///
  /// \returns true if \p I was rewritten, false if no pattern matched.
  bool liftTranspose(Instruction &I) {
    // Erase dead Instructions after lifting transposes from binops.
    // A and B are cast to Instruction unconditionally; both call sites pass
    // values that matched a transpose intrinsic.
    auto CleanupBinOp = [this](Instruction &T, Value *A, Value *B) {
      if (T.use_empty())
        eraseFromParentAndRemoveFromShapeMap(&T);
      if (A->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(A));
      // Do not touch B when it is the same value as A.
      if (A != B && B->use_empty())
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(B));
    };

    Value *A, *B, *AT, *BT;
    ConstantInt *R, *K, *C;
    // A^t * B ^t -> (B * A)^t
    if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>(
                      m_Value(A), m_Value(B), m_ConstantInt(R),
                      m_ConstantInt(K), m_ConstantInt(C))) &&
        match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(AT))) &&
        match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value((BT))))) {
      IRBuilder<> IB(&I);
      MatrixBuilder Builder(IB);
      // Multiply the untransposed operands in swapped order; the product is
      // recorded with shape C x R.
      Value *M = Builder.CreateMatrixMultiply(
          BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue());
      setShapeInfo(M, {C, R});
      Instruction *NewInst = Builder.CreateMatrixTranspose(M, C->getZExtValue(),
                                                           R->getZExtValue());
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      CleanupBinOp(I, A, B);
      return true;
    }
    // A^t + B ^t -> (A + B)^t. Pick rows and columns from first transpose. If
    // the shape of the second transpose is different, there's a shape conflict
    // which gets resolved by picking the shape of the first operand.
    else if (match(&I, m_FAdd(m_Value(A), m_Value(B))) &&
             match(A, m_Intrinsic<Intrinsic::matrix_transpose>(
                          m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) &&
             match(B, m_Intrinsic<Intrinsic::matrix_transpose>(
                          m_Value(BT), m_ConstantInt(), m_ConstantInt()))) {
      IRBuilder<> Builder(&I);
      auto *Add = Builder.CreateFAdd(AT, BT, "mfadd");
      MatrixBuilder MBuilder(Builder);
      Instruction *NewInst = MBuilder.CreateMatrixTranspose(
          Add, R->getZExtValue(), C->getZExtValue(), "mfadd_t");
      updateShapeAndReplaceAllUsesWith(I, NewInst);
      // Sanity check while I still exists; CleanupBinOp below may erase it.
      assert(computeShapeInfoForInst(NewInst, ShapeMap) ==
                 computeShapeInfoForInst(&I, ShapeMap) &&
             "Shape of new instruction doesn't match original shape.");
      CleanupBinOp(I, A, B);
      // The fadd may not be an instruction; only record a shape if it is.
      if (auto *AddI = dyn_cast<Instruction>(Add)) {
        setShapeInfo(AddI, {R, C});
        assert(
            computeShapeInfoForInst(AddI, ShapeMap).value_or(ShapeMap[AddI]) ==
                ShapeMap[AddI] &&
            "Shape of updated addition doesn't match cached shape.");
      }
      return true;
    }
    return false;
  }


  /// Try moving transposes in order to fold them away or into multiplies.
  ///
  /// Runs two sweeps over the function: a backwards one that sinks
  /// transposes, followed by a forwards one that lifts them.
  ///
  /// \returns true if either sweep modified the function.
  bool optimizeTransposes() {
    bool Changed = false;

    // Sweep 1: sink all transposes inside matmuls and adds, hoping that we
    // end up with NN, NT or TN variants.
    for (BasicBlock &BB : reverse(Func)) {
      BasicBlock::reverse_iterator Cursor = BB.rbegin();
      while (Cursor != BB.rend()) {
        Instruction &Current = *Cursor;
        // Current may get erased, so step past it up front. sinkTranspose
        // moves the cursor further if it erases what the cursor points at.
        ++Cursor;
        Instruction *Sunk = sinkTranspose(Current, Cursor, Changed);
        if (Sunk)
          Cursor = std::next(BasicBlock::reverse_iterator(Sunk));
      }
    }

    // Sweep 2: if we have a TT matmul or a TT add, lift the transpose. We may
    // be able to fold into consuming multiply or add.
    for (BasicBlock &BB : Func)
      for (Instruction &Candidate : llvm::make_early_inc_range(BB))
        if (liftTranspose(Candidate))
          Changed = true;

    return Changed;
  }


  /// Run the lowering on Func.
  ///
  /// Seeds a worklist with the matrix intrinsics, propagates shapes forwards
  /// and backwards until the worklist is empty, optionally optimizes
  /// transposes, and then lowers everything that ended up with a shape (dot
  /// products and fusion first, then the remaining instructions). Finally the
  /// instructions collected in ToRemove are erased.
  ///
  /// \returns true if the function was changed, false otherwise (in
  /// particular when it contains no matrix intrinsics).
  bool Visit() {
    SmallVector<Instruction *, 32> WorkList;

    // Initially only the shape of matrix intrinsics is known.
    // Initialize the work list with ops carrying shape information.
    for (BasicBlock &BB : Func)
      for (Instruction &Inst : BB) {
        IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
        if (!II)
          continue;

        switch (II->getIntrinsicID()) {
        case Intrinsic::matrix_multiply:
        case Intrinsic::matrix_transpose:
        case Intrinsic::matrix_column_major_load:
        case Intrinsic::matrix_column_major_store:
          WorkList.push_back(&Inst);
          break;
        default:
          break;
        }
      }

    // Avoid unnecessary work if there are no matrix intrinsics in the function.
    if (WorkList.empty())
      return false;

    // Analyses are only fetched when an analysis manager is available;
    // otherwise ORE, AA, DT and LI keep whatever value they had before.
    if (AM) {
      ORE = &AM->getResult<OptimizationRemarkEmitterAnalysis>(Func);
      AA = &AM->getResult<AAManager>(Func);
      DT = &AM->getResult<DominatorTreeAnalysis>(Func);
      LI = &AM->getResult<LoopAnalysis>(Func);
    }

    // Propagate shapes until nothing changes any longer.
    // Each direction returns the seeds for the other one.
    while (!WorkList.empty()) {
      WorkList = propagateShapeForward(WorkList);
      WorkList = propagateShapeBackward(WorkList);
    }

    bool Changed = false;
    // The transpose optimization is skipped when isMinimal() is true.
    if (!isMinimal()) {
      Changed |= optimizeTransposes();
      if (PrintAfterTransposeOpt) {
        dbgs() << "Dump after matrix transpose optimization:\n";
        Func.print(dbgs());
      }
    }

    SmallVector<CallInst *, 16> MaybeFusableInsts;
    SmallVector<Instruction *, 16> MatrixInsts;
    SmallVector<IntrinsicInst *, 16> LifetimeEnds;

    // First, collect all instructions with shape information and candidates for
    // fusion (currently only matrix multiplies).
    // lifetime.end calls are collected whether or not they have a shape.
    ReversePostOrderTraversal<Function *> RPOT(&Func);
    for (auto *BB : RPOT)
      for (Instruction &I : *BB) {
        if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>()))
          LifetimeEnds.push_back(cast<IntrinsicInst>(&I));
        if (!ShapeMap.contains(&I))
          continue;
        if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
          MaybeFusableInsts.push_back(cast<CallInst>(&I));
        MatrixInsts.push_back(&I);
      }

    // Second, try to lower any dot products
    SmallPtrSet<Instruction *, 16> FusedInsts;
    for (CallInst *CI : MaybeFusableInsts)
      lowerDotProduct(CI, FusedInsts, getFastMathFlags(CI));

    // Third, try to fuse candidates.
    // Multiplies already handled as dot products are skipped.
    for (CallInst *CI : MaybeFusableInsts)
      if (!FusedInsts.contains(CI))
        LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);

    Changed |= !FusedInsts.empty();

    // Fourth, pre-process all the PHINode's. The incoming values will be
    // assigned later in VisitPHI.
    for (Instruction *Inst : MatrixInsts) {
      if (FusedInsts.count(Inst))
        continue;

      auto *PHI = dyn_cast<PHINode>(Inst);
      if (!PHI)
        continue;

      const ShapeInfo &SI = ShapeMap.at(Inst);
      auto *EltTy = cast<FixedVectorType>(PHI->getType())->getElementType();
      MatrixTy PhiM(SI.NumRows, SI.NumColumns, EltTy);

      // Create one PHI per vector of the matrix, inserted before the
      // original PHI and sized for the same number of incoming values.
      IRBuilder<> Builder(Inst);
      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI)
        PhiM.setVector(VI, Builder.CreatePHI(PhiM.getVectorTy(),
                                             PHI->getNumIncomingValues(),
                                             PHI->getName()));
      assert(!Inst2ColumnMatrix.contains(PHI) && "map already contains phi?");
      Inst2ColumnMatrix[PHI] = PhiM;
    }

    // Fifth, lower remaining instructions with shape information.
    for (Instruction *Inst : MatrixInsts) {
      if (FusedInsts.count(Inst))
        continue;

      const ShapeInfo &SI = ShapeMap.at(Inst);

      Value *Op1;
      Value *Op2;
      MatrixTy Result;
      IRBuilder<> Builder(Inst);
      // Dispatch on the instruction kind. Instructions that match none of
      // the cases are skipped without being finalized.
      if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
        Result = VisitBinaryOperator(BinOp, SI, Builder);
      else if (auto *Cast = dyn_cast<CastInst>(Inst))
        Result = VisitCastInstruction(Cast, SI, Builder);
      else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
        Result = VisitUnaryOperator(UnOp, SI, Builder);
      else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
        Result = VisitIntrinsicInst(Intr, SI, Builder);
      else if (auto *Select = dyn_cast<SelectInst>(Inst))
        Result = VisitSelectInst(Select, SI, Builder);
      else if (match(Inst, m_Load(m_Value(Op1))))
        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
      else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);
      else if (auto *PHI = dyn_cast<PHINode>(Inst))
        Result = VisitPHI(PHI, SI, Builder);
      else
        continue;

      finalizeLowering(Inst, Result, Builder);
      Changed = true;
    }

    // Remarks are only emitted when a remark emitter is available.
    if (ORE) {
      RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
      RemarkGen.emitRemarks();
    }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    //
    // Because we add to ToRemove during fusion we can't guarantee that defs
    // are before uses.  Change uses to poison temporarily as these should get
    // removed as well.
    //
    // For verification, we keep track of where we changed uses to poison in
    // PoisonedInsts and then check that we in fact remove them.
    SmallPtrSet<Instruction *, 16> PoisonedInsts;
    for (auto *Inst : reverse(ToRemove)) {
      for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
        if (auto *Poisoned = dyn_cast<Instruction>(U.getUser()))
          PoisonedInsts.insert(Poisoned);
        U.set(PoisonValue::get(Inst->getType()));
      }
      Inst->eraseFromParent();
      // Inst is gone now, so it no longer counts as a leftover.
      PoisonedInsts.erase(Inst);
    }
    if (!PoisonedInsts.empty()) {
      // If we didn't remove all poisoned instructions, it's a hard error.
      dbgs() << "Poisoned but present instructions:\n";
      for (auto *I : PoisonedInsts)
        dbgs() << *I << "\n";
      llvm_unreachable("Poisoned but instruction not removed");
    }

    return Changed;
  }


  /// Lower an intrinsic call that carries shape information.
  ///
  /// The four matrix intrinsics are handed to their dedicated lowering
  /// routines. abs and fabs are applied to every vector of the operand
  /// matrix (of shape \p SI) separately. Any other intrinsic is not expected
  /// here and ends in llvm_unreachable.
  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    assert(Inst->getCalledFunction() &&
           Inst->getCalledFunction()->isIntrinsic());

    switch (Inst->getCalledFunction()->getIntrinsicID()) {
    case Intrinsic::matrix_multiply:
      return LowerMultiply(Inst, Builder);
    case Intrinsic::matrix_transpose:
      return LowerTranspose(Inst, Builder);
    case Intrinsic::matrix_column_major_load:
      return LowerColumnMajorLoad(Inst, Builder);
    case Intrinsic::matrix_column_major_store:
      return LowerColumnMajorStore(Inst, Builder);
    case Intrinsic::abs:
    case Intrinsic::fabs: {
      MatrixTy Lowered;
      MatrixTy Operand = getMatrix(Inst->getOperand(0), SI, Builder);
      Builder.setFastMathFlags(getFastMathFlags(Inst));

      for (auto *Piece : Operand.vectors()) {
        const Intrinsic::ID IID = Inst->getIntrinsicID();
        if (IID == Intrinsic::abs) {
          // abs takes a second operand, which is forwarded unchanged.
          Lowered.addVector(Builder.CreateBinaryIntrinsic(
              Intrinsic::abs, Piece, Inst->getOperand(1)));
        } else if (IID == Intrinsic::fabs) {
          Lowered.addVector(Builder.CreateUnaryIntrinsic(IID, Piece));
        } else {
          llvm_unreachable("unexpected intrinsic");
        }
      }

      return Lowered.addNumComputeOps(getNumOps(Lowered.getVectorTy()) *
                                      Lowered.getNumVectors());
    }
    default:
      break;
    }
    llvm_unreachable(
        "only intrinsics supporting shape info should be seen here");
  }


  /// Compute the alignment of the column/row at index \p Idx when consecutive
  /// columns/rows are \p Stride elements apart.
  ///
  /// The address at \p Idx == 0 has alignment \p A (or the ABI alignment of
  /// \p ElementTy if \p A is unset). For a ConstantInt \p Stride the initial
  /// alignment is reduced based on the byte offset of index \p Idx. For any
  /// other stride the result is the common alignment of the initial alignment
  /// and the element size in bytes.
  Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
                         MaybeAlign A) const {
    const Align BaseAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
    if (Idx != 0) {
      TypeSize EltBits = DL.getTypeSizeInBits(ElementTy);
      auto *KnownStride = dyn_cast<ConstantInt>(Stride);
      // Unknown stride: only the element size can be relied upon.
      if (!KnownStride)
        return commonAlignment(BaseAlign, EltBits / 8);

      uint64_t BytesPerStride = KnownStride->getZExtValue() * EltBits / 8;
      return commonAlignment(BaseAlign, Idx * BytesPerStride);
    }
    return BaseAlign;
  }


  /// Return the index type the data layout assigns to the type of \p Ptr.
  /// The result is cast to IntegerType, so \p Ptr's index type must be an
  /// integer type.
  IntegerType *getIndexType(Value *Ptr) const {
    return cast<IntegerType>(DL.getIndexType(Ptr->getType()));
  }


  /// Return the constant \p V as a value of the index type of \p Ptr (see
  /// getIndexType).
  Value *getIndex(Value *Ptr, uint64_t V) const {
    return ConstantInt::get(getIndexType(Ptr), V);
  }


  /// Convert the integer value \p V to the index type of \p Ptr.
  ///
  /// The cast is emitted through \p Builder and named after \p V with a
  /// ".cast" suffix. \p V must have an integer type.
  Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
    assert(isa<IntegerType>(V->getType()) &&
           "Attempted to cast non-integral type to integer index");
    // The width of the data layout's index type may differ from that of V.
    // Indices are unsigned, so a narrower value is zero extended; a wider
    // one is truncated.
    IntegerType *IndexTy = getIndexType(Ptr);
    return Builder.CreateZExtOrTrunc(V, IndexTy, V->getName() + ".cast");
  }


  /// Load a matrix with \p Shape starting at \p Ptr, with \p Stride elements
  /// between the starts of consecutive vectors.
  ///
  /// \p Ty is the flat vector type of the matrix; only its element type is
  /// used. One aligned load is emitted per vector of \p Shape, and the load
  /// count of the returned matrix is updated accordingly.
  MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
                      bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
    Type *ElemTy = cast<FixedVectorType>(Ty)->getElementType();
    Type *OneVecTy = FixedVectorType::get(ElemTy, Shape.getStride());
    MatrixTy Loaded;

    // The vector index below is built with the same bit width as the
    // (index-typed) stride.
    Stride = castToIndexType(Ptr, Stride, Builder);
    const unsigned IndexBits = Stride->getType()->getScalarSizeInBits();

    const unsigned NumVectors = Shape.getNumVectors();
    for (unsigned Idx = 0; Idx < NumVectors; ++Idx) {
      Value *Addr =
          computeVectorAddr(Ptr, Builder.getIntN(IndexBits, Idx), Stride,
                            Shape.getStride(), ElemTy, Builder);
      Loaded.addVector(Builder.CreateAlignedLoad(
          OneVecTy, Addr, getAlignForIndex(Idx, Stride, ElemTy, MAlign),
          IsVolatile, "col.load"));
    }
    return Loaded.addNumLoads(getNumOps(Loaded.getVectorTy()) *
                              Loaded.getNumVectors());
  }


  /// Loads a sub-matrix (tile) with shape \p ResultShape out of the matrix of
  /// shape \p MatrixShape stored at \p MatrixPtr, starting at element
  /// \p MatrixPtr[I][J].
  ///
  /// The tile's first element lives at element offset J * stride + I, where
  /// stride is \p MatrixShape's stride; the same stride separates the tile's
  /// vectors.
  MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
                      ShapeInfo MatrixShape, Value *I, Value *J,
                      ShapeInfo ResultShape, Type *EltTy,
                      IRBuilder<> &Builder) {
    Value *SkippedVectors =
        Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride()));
    Value *Offset = Builder.CreateAdd(SkippedVectors, I);
    Value *TileStart = Builder.CreateInBoundsGEP(EltTy, MatrixPtr, Offset);

    const auto TileElts = ResultShape.NumRows * ResultShape.NumColumns;
    auto *TileTy = FixedVectorType::get(EltTy, TileElts);

    Value *ParentStride = getIndex(MatrixPtr, MatrixShape.getStride());
    return loadMatrix(TileTy, TileStart, Align, ParentStride, IsVolatile,
                      ResultShape, Builder);
  }


  /// Lower a load instruction with shape information.
  ///
  /// Forwards to loadMatrix, using the type of \p Inst as the matrix type;
  /// all other arguments are passed through unchanged.
  MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
                     Value *Stride, bool IsVolatile, ShapeInfo Shape,
                     IRBuilder<> &Builder) {
    return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape,
                      Builder);
  }


  /// Lowers llvm.matrix.column.major.load.
  ///
  /// The intrinsic loads a matrix from memory using a stride between columns.
  /// Its arguments are read positionally: pointer, stride, volatile flag,
  /// then the two shape dimensions. The alignment comes from the pointer
  /// parameter's align attribute.
  MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *BasePtr = Inst->getArgOperand(0);
    Value *ColumnStride = Inst->getArgOperand(1);
    const bool Volatile = cast<ConstantInt>(Inst->getArgOperand(2))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(3), Inst->getArgOperand(4));
    return LowerLoad(Inst, BasePtr, Inst->getParamAlign(0), ColumnStride,
                     Volatile, Shape, Builder);
  }


  /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p

  /// MatrixPtr[I][J].

  void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,

                   MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,

                   Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {

    Value *Offset = Builder.CreateAdd(

        Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);


    Value *TileStart = Builder.CreateInBoundsGEP(EltTy, MatrixPtr, Offset);

    auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *

                                                   StoreVal.getNumColumns());


    storeMatrix(TileTy, StoreVal, TileStart, MAlign,

                getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,

                Builder);

  }


  /// Store matrix \p StoreVal starting at \p Ptr, with \p Stride elements
  /// between the starts of consecutive vectors.
  ///
  /// \p Ty is the flat vector type of the matrix; only its element type is
  /// used. One aligned store is emitted per vector of \p StoreVal.
  ///
  /// \returns an empty matrix that only carries the number of stores.
  MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
                       MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                       IRBuilder<> &Builder) {
    Type *ElemTy = cast<FixedVectorType>(Ty)->getElementType();

    // The vector index below is built with the same bit width as the
    // (index-typed) stride.
    Stride = castToIndexType(Ptr, Stride, Builder);
    const unsigned IndexBits = Stride->getType()->getScalarSizeInBits();

    for (auto Entry : enumerate(StoreVal.vectors())) {
      Value *Addr = computeVectorAddr(
          Ptr, Builder.getIntN(IndexBits, Entry.index()), Stride,
          StoreVal.getStride(), ElemTy, Builder);
      Builder.CreateAlignedStore(
          Entry.value(), Addr,
          getAlignForIndex(Entry.index(), Stride, ElemTy, MAlign), IsVolatile);
    }
    return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
                                   StoreVal.getNumVectors());
  }


  /// Lower a store instruction with shape information.
  ///
  /// Fetches \p Matrix as a matrix of shape \p Shape via getMatrix and
  /// forwards to storeMatrix, using the type of \p Matrix as the matrix type.
  /// \p Inst itself is not used by this function.
  MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
                      MaybeAlign A, Value *Stride, bool IsVolatile,
                      ShapeInfo Shape, IRBuilder<> &Builder) {
    auto StoreVal = getMatrix(Matrix, Shape, Builder);
    return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile,
                       Builder);
  }


  /// Lowers llvm.matrix.column.major.store.
  ///
  /// The intrinsic stores a matrix back to memory using a stride between
  /// columns. Its arguments are read positionally: matrix, pointer, stride,
  /// volatile flag, then the two shape dimensions. The alignment comes from
  /// the pointer parameter's align attribute.
  MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Intrinsic only supports column-major layout!");
    Value *Stored = Inst->getArgOperand(0);
    Value *BasePtr = Inst->getArgOperand(1);
    Value *ColumnStride = Inst->getArgOperand(2);
    const bool Volatile = cast<ConstantInt>(Inst->getArgOperand(3))->isOne();
    ShapeInfo Shape(Inst->getArgOperand(4), Inst->getArgOperand(5));
    return LowerStore(Inst, Stored, BasePtr, Inst->getParamAlign(1),
                      ColumnStride, Volatile, Shape, Builder);
  }


  // Overwrite elements I..I+BlockNumElts-1 of Col with the elements of Block,
  // where BlockNumElts is the number of elements in Block. Returns the
  // resulting vector, which has the same length as Col.
  Value *insertVector(Value *Col, unsigned I, Value *Block,
                      IRBuilder<> &Builder) {
    const unsigned BlockLen =
        cast<FixedVectorType>(Block->getType())->getNumElements();
    const unsigned ColLen =
        cast<FixedVectorType>(Col->getType())->getNumElements();
    assert(ColLen >= BlockLen && "Too few elements for current block");

    // Step 1: widen Block to the length of Col, so that both inputs of the
    // final shuffle have the same type.
    Block = Builder.CreateShuffleVector(
        Block, createSequentialMask(0, BlockLen, ColLen - BlockLen));

    // Step 2: build the blend mask. Lanes below ColLen select from Col, lanes
    // from ColLen upwards select from the widened Block.
    // E.g. for a Col of length 7, I = 2 and a Block of length 2 the mask is:
    // 0, 1, 7, 8, 4, 5, 6
    SmallVector<int, 16> Mask;
    unsigned Lane = 0;
    while (Lane < I)
      Mask.push_back(Lane++);

    for (const unsigned BlockEnd = I + BlockLen; Lane < BlockEnd; ++Lane)
      Mask.push_back(Lane - I + ColLen);

    while (Lane < ColLen)
      Mask.push_back(Lane++);

    return Builder.CreateShuffleVector(Col, Block, Mask);
  }


  /// Emit Sum + A * B, or just A * B when \p Sum is null.
  ///
  /// \p UseFPOp selects floating-point over integer instructions. For
  /// floating point with \p AllowContraction set, a single fmuladd intrinsic
  /// is emitted instead of separate multiply and add. \p NumComputeOps is
  /// increased by the op count of \p A's type once for the multiply (or
  /// fmuladd) and once more when a separate add is emitted.
  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
                      IRBuilder<> &Builder, bool AllowContraction,
                      unsigned &NumComputeOps) {
    NumComputeOps += getNumOps(A->getType());

    // No running sum yet: the product is the result.
    if (!Sum) {
      if (UseFPOp)
        return Builder.CreateFMul(A, B);
      return Builder.CreateMul(A, B);
    }

    // Use fmuladd for floating point operations and let the backend decide
    // if that's profitable.
    if (UseFPOp && AllowContraction)
      return Builder.CreateIntrinsic(Intrinsic::fmuladd, A->getType(),
                                     {A, B, Sum});

    // From here on the add is a separate operation and is counted as well.
    NumComputeOps += getNumOps(A->getType());
    if (UseFPOp) {
      Value *Product = Builder.CreateFMul(A, B);
      return Builder.CreateFAdd(Sum, Product);
    }

    Value *Product = Builder.CreateMul(A, B);
    return Builder.CreateAdd(Sum, Product);
  }


  /// Record \p Matrix as the lowered form of \p Inst, fix up the uses of
  /// \p Inst and mark \p Inst for deletion.
  ///
  /// Uses by users that have shape information are left alone: those users
  /// pick up the cached matrix when they are lowered themselves. For all
  /// other users \p Matrix is flattened into a single vector (at most once,
  /// through \p Builder) and the use is redirected to it.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
    auto InsertResult = Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
    (void)InsertResult;
    // PHIs may already have an entry at this point; nothing else may.
    assert((InsertResult.second || isa<PHINode>(Inst)) &&
           "multiple matrix lowering mapping");

    ToRemove.push_back(Inst);

    Value *Embedded = nullptr;
    for (Use &U : llvm::make_early_inc_range(Inst->uses())) {
      if (!ShapeMap.contains(U.getUser())) {
        // Flatten lazily, on the first use that actually needs it.
        if (Embedded == nullptr) {
          Embedded = Matrix.embedInVector(Builder);
          LLVM_DEBUG(
              if (Instruction *UserInst = dyn_cast<Instruction>(U.getUser()))
                  dbgs()
                  << "flattening a " << Matrix.shape() << " matrix:\n"
                  << *Inst
                  << "\nbecause we do not have a shape-aware lowering for its "
                     "user:\n"
                  << *UserInst << '\n';);
          FlattenedMatrices++;
        }
        U.set(Embedded);
      }
    }
  }


  /// Special case for MatMul lowering of dot products (an LHS with a single
  /// row times an RHS with a single column). Prevents scalar loads of
  /// row-major vectors. Lowers to a vector reduction add instead of sequential
  /// adds if reassociation is enabled.
  ///
  /// Returns without changing anything if \p MatMul is already in
  /// \p FusedInsts, the matrix layout is not column-major, the shapes are not
  /// those of a dot product, the element type is not an integer type and
  /// \p FMF does not allow reassociation, or the cost estimate favors the
  /// sequential lowering. Otherwise \p MatMul is replaced by a multiply
  /// followed by an add reduction, added to \p FusedInsts and queued in
  /// ToRemove.
  void lowerDotProduct(CallInst *MatMul,
                       SmallPtrSet<Instruction *, 16> &FusedInsts,
                       FastMathFlags FMF) {
    if (FusedInsts.contains(MatMul) ||
        MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;
    // The shapes are built from call operands 2-4; operand 3 is shared between
    // the LHS and the RHS shape.
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    if (LShape.NumRows != 1 || RShape.NumColumns != 1) // not a dot product
      return;

    Value *LHS = MatMul->getArgOperand(0);
    Value *RHS = MatMul->getArgOperand(1);

    Type *ElementType = cast<FixedVectorType>(LHS->getType())->getElementType();
    bool IsIntVec = ElementType->isIntegerTy();

    // Floating point reductions require reassociation.
    if (!IsIntVec && !FMF.allowReassoc())
      return;

    // Any binary operator is accepted. Loads, transposes and column-major
    // loads whose second operand is the constant one are accepted only when
    // they have a single use.
    auto CanBeFlattened = [](Value *Op) {
      if (match(Op, m_BinOp()))
        return true;
      return match(
          Op, m_OneUse(m_CombineOr(
                  m_Load(m_Value()),
                  m_CombineOr(m_Intrinsic<Intrinsic::matrix_transpose>(),
                              m_Intrinsic<Intrinsic::matrix_column_major_load>(
                                  m_Value(), m_One())))));
    };
    // Returns the cost benefit of using \p Op with the dot product lowering. If
    // the returned cost is < 0, the argument is cheaper to use in the
    // dot-product lowering. Values without an entry in ShapeMap get an invalid
    // cost; non-instructions get a cost of zero.
    auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) {
      if (!ShapeMap.contains(Op))
        return InstructionCost::getInvalid();

      if (!isa<Instruction>(Op))
        return InstructionCost(0);

      FixedVectorType *VecTy = cast<FixedVectorType>(Op->getType());
      Type *EltTy = VecTy->getElementType();

      if (!CanBeFlattened(Op)) {
        InstructionCost EmbedCost(0);
        // Roughly estimate the cost for embedding the columns into a vector.
        for (unsigned I = 1; I < N; ++I)
          EmbedCost += TTI.getShuffleCost(
              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
        return EmbedCost;
      }

      // Binary operator: one operation on the whole vector versus N scalar
      // operations.
      if (match(Op, m_BinOp()) && ShapeMap.contains(Op)) {
        InstructionCost OriginalCost =
            TTI.getArithmeticInstrCost(cast<Instruction>(Op)->getOpcode(),
                                       EltTy) *
            N;
        InstructionCost NewCost = TTI.getArithmeticInstrCost(
            cast<Instruction>(Op)->getOpcode(), VecTy);
        return NewCost - OriginalCost;
      }

      if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>())) {
        // The transpose can be skipped for the dot product lowering, roughly
        // estimate the savings as the cost of embedding the columns in a
        // vector.
        InstructionCost EmbedCost(0);
        for (unsigned I = 1; I < N; ++I)
          EmbedCost -= TTI.getShuffleCost(
              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
        return EmbedCost;
      }

      // Costs for loads: one load of the whole vector versus N element loads.
      if (N == 1)
        return InstructionCost(0);

      return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) -
             N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0);
    };

    // Iterate over LHS and operations feeding LHS and check if it is profitable
    // to flatten the visited ops.  For each op, we compute the difference
    // between the flattened and matrix versions.
    SmallPtrSet<Value *, 4> Seen;
    SmallVector<Value *> WorkList;
    SmallVector<Value *> ToFlatten;
    WorkList.push_back(LHS);
    InstructionCost LHSCost(0);
    while (!WorkList.empty()) {
      Value *Op = WorkList.pop_back_val();
      if (!Seen.insert(Op).second)
        continue;

      // Only ops that lower the accumulated cost are flattened; the operands
      // of such an op are then visited as well.
      InstructionCost OpCost = GetCostForArg(Op, LShape.NumColumns);
      if (OpCost + LHSCost >= LHSCost)
        continue;

      LHSCost += OpCost;
      ToFlatten.push_back(Op);
      if (auto *I = dyn_cast<Instruction>(Op))
        WorkList.append(I->op_begin(), I->op_end());
    }

    // We compare the costs of a vector.reduce.add to sequential add.
    int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd;
    int MulOpCode = IsIntVec ? Instruction::Mul : Instruction::FMul;
    InstructionCost ReductionCost =
        TTI.getArithmeticReductionCost(
            AddOpCode, cast<FixedVectorType>(LHS->getType()),
            IsIntVec ? std::nullopt : std::optional(FMF)) +
        TTI.getArithmeticInstrCost(MulOpCode, LHS->getType());
    InstructionCost SequentialAddCost =
        TTI.getArithmeticInstrCost(AddOpCode, ElementType) *
            (LShape.NumColumns - 1) +
        TTI.getArithmeticInstrCost(MulOpCode, ElementType) *
            (LShape.NumColumns);
    if ((LHSCost + ReductionCost - SequentialAddCost) > InstructionCost(0))
      return;

    FusedInsts.insert(MatMul);
    IRBuilder<> Builder(MatMul);
    auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened,
                       this](Value *Op) {
      // Matmul must be the only user of loads because we don't use LowerLoad
      // for row vectors (LowerLoad results in scalar loads and shufflevectors
      // instead of single vector load).
      if (!CanBeFlattened(Op))
        return;

      // A binary operator with a recorded shape is kept; only its ShapeMap
      // entry is replaced by the result of t() on the old entry.
      if (match(Op, m_BinOp())) {
        auto It = ShapeMap.find(Op);
        if (It != ShapeMap.end()) {
          It->second = It->second.t();
          return;
        }
      }

      FusedInsts.insert(cast<Instruction>(Op));
      // If vector uses the builtin load, lower to a LoadInst
      Value *Arg;
      if (match(Op, m_Intrinsic<Intrinsic::matrix_column_major_load>(
                        m_Value(Arg)))) {
        auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);
        Op->replaceAllUsesWith(NewLoad);
        eraseFromParentAndRemoveFromShapeMap(cast<Instruction>(Op));
        return;
      } else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>(
                               m_Value(Arg)))) {
        // Forward the transpose's input to its users and queue the transpose
        // itself for removal.
        ToRemove.push_back(cast<Instruction>(Op));
        Op->replaceAllUsesWith(Arg);
        return;
      }
    };

    for (auto *V : ToFlatten)
      FlattenArg(V);

    // Re-read the LHS operand: FlattenArg may have replaced it through
    // replaceAllUsesWith.
    LHS = MatMul->getArgOperand(0);

    // Insert mul/fmul followed by an add/fadd reduction.
    Value *Mul =
        IsIntVec ? Builder.CreateMul(LHS, RHS) : Builder.CreateFMul(LHS, RHS);

    Value *Result;
    if (IsIntVec)
      Result = Builder.CreateAddReduce(Mul);
    else {
      // The floating point reduction starts from 0.0 and carries FMF.
      Result = Builder.CreateFAddReduce(
          ConstantFP::get(
              cast<FixedVectorType>(LHS->getType())->getElementType(), 0.0),
          Mul);
      cast<Instruction>(Result)->setFastMathFlags(FMF);
    }

    // pack scalar back into a matrix and then replace matmul inst
    Result = Builder.CreateInsertElement(PoisonValue::get(MatMul->getType()),
                                         Result, uint64_t(0));
    MatMul->replaceAllUsesWith(Result);
    // NOTE(review): MatMul was already inserted into FusedInsts above, so this
    // second insert looks redundant.
    FusedInsts.insert(MatMul);
    ToRemove.push_back(MatMul);
  }


  /// Given \p Remainder iterations of the matmul inner loop, potentially
  /// lower the \p BlockSize that is used for the underlying vector.
  ///
  /// Returns \p BlockSize unchanged when it does not exceed \p Remainder.
  /// Otherwise returns \p Remainder if a vector of \p Remainder elements of
  /// \p EltType is legal for the target or no wider than
  /// SplitMatmulRemainderOverThreshold bits, and else \p BlockSize halved
  /// until it no longer exceeds \p Remainder.
  unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) {
    // The full block still fits.
    if (Remainder >= BlockSize)
      return BlockSize;

    // Cover the remainder with a single vector if that type is legal, or if
    // it is small enough that we do not want to split it further.
    auto *RemainderTy = FixedVectorType::get(EltType, Remainder);
    const bool UseRemainderAsIs =
        TTI.isTypeLegal(RemainderTy) ||
        RemainderTy->getPrimitiveSizeInBits() <=
            SplitMatmulRemainderOverThreshold;
    if (UseRemainderAsIs)
      return Remainder;

    // Gradually lower the vectorization factor until it fits the remainder.
    unsigned Capped = BlockSize / 2;
    while (Capped > Remainder)
      Capped /= 2;
    return Capped;
  }


  /// Compute \p Result += \p A * \p B for input matrices with left-associating
  /// addition.
  ///
  /// We can fold a transpose into the operand that is used to extract scalars.
  /// This is the first operand with row-major and the second with
  /// column-major.  If \p IsScalarMatrixTransposed we assume the appropriate
  /// operand is transposed.
  ///
  /// \p Result, \p A and \p B must agree on the matrix layout (asserted).
  /// \p IsTiled is only consulted in the column-major path, where it makes each
  /// block's accumulator start from the current contents of \p Result.
  /// \p FMF is installed on \p Builder, and its contract flag is forwarded to
  /// createMulAdd. The number of compute ops reported by createMulAdd is added
  /// to \p Result.
  void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
                          const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled,
                          bool IsScalarMatrixTransposed, FastMathFlags FMF) {
    // Number of result elements per fixed-width vector register, at least 1.
    const unsigned VF = std::max<unsigned>(
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
                .getFixedValue() /
            Result.getElementType()->getPrimitiveSizeInBits().getFixedValue(),
        1U);
    unsigned R = Result.getNumRows();
    unsigned C = Result.getNumColumns();
    unsigned M = A.getNumColumns();

    bool IsFP = Result.getElementType()->isFloatingPointTy();
    assert(A.isColumnMajor() == B.isColumnMajor() &&
           Result.isColumnMajor() == A.isColumnMajor() &&
           "operands must agree on matrix layout");
    unsigned NumComputeOps = 0;

    Builder.setFastMathFlags(FMF);

    if (A.isColumnMajor()) {
      // Multiply columns from the first operand with scalars from the second
      // operand. Then move along the K axes and accumulate the columns.  With
      // this the adds can be vectorized without reassociation.
      for (unsigned J = 0; J < C; ++J) {
        unsigned BlockSize = VF;
        // If Result is zero, we don't need to accumulate in the K==0 iteration.
        bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));

        for (unsigned I = 0; I < R; I += BlockSize) {
          // Lower block size to make sure we stay within bounds.
          BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType());
          // When tiled, continue accumulating into the existing result block.
          Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
                               : nullptr;
          for (unsigned K = 0; K < M; ++K) {
            Value *L = A.extractVector(I, K, BlockSize, Builder);
            // With a folded transpose the vector and element indices into B
            // are swapped.
            Value *RH = Builder.CreateExtractElement(
                B.getColumn(IsScalarMatrixTransposed ? K : J),
                IsScalarMatrixTransposed ? J : K);
            Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
            Sum =
                createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
                             IsFP, Builder, FMF.allowContract(), NumComputeOps);
          }
          Result.setVector(J,
                           insertVector(Result.getVector(J), I, Sum, Builder));
        }
      }
    } else {
      // Multiply rows from the second operand with scalars from the first
      // operand. Then move along the K axes and accumulate the rows.  With this
      // the adds can be vectorized without reassociation.
      for (unsigned I = 0; I < R; ++I) {
        unsigned BlockSize = VF;
        bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
        for (unsigned J = 0; J < C; J += BlockSize) {
          // Lower the vectorization factor to cover the remainder.
          BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType());

          // NOTE(review): unlike the column-major path, IsTiled is not
          // consulted here; Sum always starts out as nullptr.
          Value *Sum = nullptr;
          for (unsigned K = 0; K < M; ++K) {
            // Note: this Value *R shadows the row count R declared above.
            Value *R = B.extractVector(K, J, BlockSize, Builder);
            // With a folded transpose the vector and element indices into A
            // are swapped.
            Value *LH = Builder.CreateExtractElement(
                A.getVector(IsScalarMatrixTransposed ? K : I),
                IsScalarMatrixTransposed ? I : K);
            Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
            Sum =
                createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
                             IsFP, Builder, FMF.allowContract(), NumComputeOps);
          }
          Result.setVector(I,
                           insertVector(Result.getVector(I), J, Sum, Builder));
        }
      }
    }
    Result.addNumComputeOps(NumComputeOps);
  }


  /// Ensure that the memory in \p Load does not alias \p Store by potentially
  /// copying it to a new location.
  ///
  /// Returns a pair of the pointer to read the operand from and the alloca
  /// created for the copy (nullptr if none was created). The pointer is
  ///  * the original pointer operand of \p Load if AA proves no-alias,
  ///  * the new alloca, filled by an unconditional memcpy before \p MatMul, if
  ///    \p Load and \p Store use different address spaces,
  ///  * otherwise a PHI that selects between the original pointer and the
  ///    alloca, based on runtime overlap checks inserted before \p MatMul.
  /// The last case splits the block containing \p MatMul and updates DT.
  std::pair<Value *, AllocaInst *>
  getNonAliasingPointer(LoadInst *Load, StoreInst *Store, CallInst *MatMul) {
    MemoryLocation StoreLoc = MemoryLocation::get(Store);
    MemoryLocation LoadLoc = MemoryLocation::get(Load);

    // If we can statically determine noalias we're good.
    if (AA->isNoAlias(LoadLoc, StoreLoc))
      return {Load->getPointerOperand(), nullptr};

    // If the pointers are in different address spaces, we cannot compare them
    // at runtime. Conservatively copy the load operand to a new buffer.
    // Allocas are inserted before the first instruction of the entry block.
    IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
    if (Load->getPointerAddressSpace() != Store->getPointerAddressSpace()) {
      auto *VT = cast<FixedVectorType>(Load->getType());
      auto *ArrayTy =
          ArrayType::get(VT->getElementType(), VT->getNumElements());
      AllocaInst *Alloca =
          AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
      IRBuilder<> Builder(MatMul);
      Builder.CreateLifetimeStart(Alloca);
      Builder.CreateMemCpy(Alloca, Alloca->getAlign(),
                           Load->getPointerOperand(), Load->getAlign(),
                           LoadLoc.Size.getValue());
      return {Alloca, Alloca};
    }

    // Create code to check if the memory locations of the Load and Store
    // overlap and if they do, copy Load's operand to a new buffer.

    // First, create new blocks for the second part of the check and the copy.
    BasicBlock *Check0 = MatMul->getParent();
    // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
    // DT. Manually collect dominator tree updates, to avoid unnecessary work,
    // as we adjust Check0 and Check1's branches.
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    for (BasicBlock *Succ : successors(Check0))
      DTUpdates.push_back({DT->Delete, Check0, Succ});

    // Each split happens at MatMul, so MatMul ends up in the last block
    // created (Fusion).
    BasicBlock *Check1 =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "alias_cont");
    BasicBlock *Copy =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "copy");
    BasicBlock *Fusion =
        SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
                   nullptr, "no_alias");

    // Check if the loaded memory location begins before the end of the store
    // location. If the condition holds, they might overlap, otherwise they are
    // guaranteed to not overlap.
    IRBuilder<> Builder(MatMul);
    Check0->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check0);
    Type *AddrTy = DL.getAddressType(Store->getPointerOperand()->getType());
    Value *StoreBegin = Store->getPointerOperand();
    Value *StoreEnd = Builder.CreatePtrAdd(
        StoreBegin, ConstantInt::get(AddrTy, StoreLoc.Size.getValue()),
        "store.end",
        GEPNoWrapFlags::inBounds() | GEPNoWrapFlags::noUnsignedWrap());
    Value *LoadBegin = Load->getPointerOperand();
    CondBrInst *BR1 = Builder.CreateCondBr(
        Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1, Fusion);
    setExplicitlyUnknownBranchWeightsIfProfiled(*BR1, DEBUG_TYPE);

    // Check if the store begins before the end of the load location. If the
    // condition holds, they alias, otherwise they are guaranteed to not
    // overlap.
    Check1->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(Check1, Check1->begin());

    auto *VT = cast<FixedVectorType>(Load->getType());
    // Use an array type for the alloca, to avoid potentially huge alignment
    // requirements for large vector types.
    auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
    AllocaInst *Alloca =
        AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
    // The lifetime of the alloca starts in Check1, i.e. before the second
    // check has decided whether the copy is needed.
    Builder.CreateLifetimeStart(Alloca);

    Value *LoadEnd = Builder.CreatePtrAdd(
        LoadBegin, ConstantInt::get(AddrTy, LoadLoc.Size.getValue()),
        "load.end",
        GEPNoWrapFlags::inBounds() | GEPNoWrapFlags::noUnsignedWrap());
    CondBrInst *BR2 = Builder.CreateCondBr(
        Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy, Fusion);
    setExplicitlyUnknownBranchWeightsIfProfiled(*BR2, DEBUG_TYPE);

    // Copy load operand to new alloca.
    Builder.SetInsertPoint(Copy, Copy->begin());
    Builder.CreateMemCpy(Alloca, Alloca->getAlign(), Load->getPointerOperand(),
                         Load->getAlign(), LoadLoc.Size.getValue());
    // Merge the three ways of reaching Fusion: both checks yield the original
    // pointer, the copy block yields the alloca.
    Builder.SetInsertPoint(Fusion, Fusion->begin());
    PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
    PHI->addIncoming(Load->getPointerOperand(), Check0);
    PHI->addIncoming(Load->getPointerOperand(), Check1);
    PHI->addIncoming(Alloca, Copy);

    // Adjust DT.
    DTUpdates.push_back({DT->Insert, Check0, Check1});
    DTUpdates.push_back({DT->Insert, Check0, Fusion});
    DTUpdates.push_back({DT->Insert, Check1, Copy});
    DTUpdates.push_back({DT->Insert, Check1, Fusion});
    DT->applyUpdates(DTUpdates);
    return {PHI, Alloca};
  }


  /// Estimate whether lowering \p MatMul with tiling/fusion is worthwhile.
  ///
  /// Always returns true when ForceFusion is set. Otherwise fusion is rejected
  /// when the result has a single column and its rows fit into one vector
  /// register, and accepted only when the estimated number of vector registers
  /// needed for both operands exceeds the number of registers reported by TTI.
  bool isFusionProfitable(CallInst *MatMul) {
    if (ForceFusion)
      return true;

    ShapeInfo LhsShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RhsShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    const unsigned NumRows = LhsShape.NumRows;
    const unsigned NumInner = LhsShape.NumColumns;
    const unsigned NumCols = RhsShape.NumColumns;

    // Number of elements that fit into one fixed-width vector register, but
    // never less than one.
    auto *ElementTy =
        cast<FixedVectorType>(MatMul->getType())->getElementType();
    const auto RegisterBits =
        TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedValue();
    const auto ElementBits = ElementTy->getPrimitiveSizeInBits().getFixedValue();
    const unsigned LanesPerReg =
        std::max<unsigned>(RegisterBits / ElementBits, 1U);

    // Cost model for tiling.
    //
    // Tiling only pays off with reuse along the row or the column axis. Rows
    // are the vectorized axis, so a single result column that fits into one
    // register offers no reuse.
    // TODO: Also consider cost of copying if operands alias.
    const bool NoReuse = NumCols == 1 && NumRows <= LanesPerReg;
    if (NoReuse)
      return false;

    // Beyond that, the operands must need more vector registers than are
    // available. This is an oversimplification: fusing also takes some extra
    // loads, which may exceed the number of reloads necessary.
    auto RegsForElts = [LanesPerReg](unsigned NumElts) {
      return (NumElts + LanesPerReg - 1) / LanesPerReg;
    };
    const unsigned LhsRegs = RegsForElts(NumRows) * NumInner;
    const unsigned RhsRegs = RegsForElts(NumInner) * NumCols;
    const unsigned AvailableRegs =
        TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
    return LhsRegs + RhsRegs > AvailableRegs;
  }


  /// Build a matrix made of \p C vectors, each a zero-initialized fixed vector
  /// of \p R elements of type \p EltType.
  MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
    auto *ZeroVecTy = FixedVectorType::get(EltType, R);
    MatrixTy Zero;
    for (unsigned Remaining = C; Remaining != 0; --Remaining)
      Zero.addVector(ConstantAggregateZero::get(ZeroVecTy));
    return Zero;
  }


  /// Lower the multiply \p MatMul, whose operands are read through \p LPtr
  /// (shape \p LShape) and \p RPtr (shape \p RShape), to a loop nest over
  /// TileSize x TileSize tiles. The loop nest is created by
  /// TileInfo::CreateTiledLoops between the block containing \p MatMul and a
  /// new "continue" block split off at \p MatMul. Result tiles are written
  /// through the pointer operand of \p Store, using its alignment and
  /// volatility.
  ///
  /// All tiles are assumed to be full TileSize x TileSize tiles; the visible
  /// caller only takes this path when both result dimensions are multiples of
  /// TileSize.
  void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
                        Value *RPtr, ShapeInfo RShape, StoreInst *Store) {
    auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();

    // Create the main tiling loop nest.
    TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    Instruction *InsertI = cast<Instruction>(MatMul);
    BasicBlock *Start = InsertI->getParent();
    BasicBlock *End =
        SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue");
    IRBuilder<> Builder(MatMul);
    BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI);

    Type *TileVecTy =
        FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize);
    MatrixTy TileResult;
    // Insert in the inner loop header.
    Builder.SetInsertPoint(TI.KLoop.Header->getTerminator());
    // Create PHI nodes for the result columns to accumulate across iterations.
    // Each PHI starts at zero when entered from the single successor of the
    // row loop header.
    SmallVector<PHINode *, 4> ColumnPhis;
    for (unsigned I = 0; I < TileSize; I++) {
      auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I));
      Phi->addIncoming(ConstantAggregateZero::get(TileVecTy),
                       TI.RowLoop.Header->getSingleSuccessor());
      TileResult.addVector(Phi);
      ColumnPhis.push_back(Phi);
    }

    // Insert in the inner loop body, which computes
    //   Res += Load(CurrentRow, K) * Load(K, CurrentColumn)
    Builder.SetInsertPoint(InnerBody->getTerminator());
    // Load tiles of the operands. The loads use a default-constructed
    // alignment argument and are non-volatile.
    MatrixTy A =
        loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index,
                   {TileSize, TileSize}, EltType, Builder);
    MatrixTy B =
        loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index, TI.ColumnLoop.Index,
                   {TileSize, TileSize}, EltType, Builder);
    emitMatrixMultiply(TileResult, A, B, Builder, true, false,
                       getFastMathFlags(MatMul));
    // Store result after the inner loop is done.
    Builder.SetInsertPoint(TI.RowLoop.Latch->getTerminator());
    storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(),
                Store->isVolatile(), {LShape.NumRows, RShape.NumColumns},
                TI.RowLoop.Index, TI.ColumnLoop.Index, EltType, Builder);

    // Close the accumulation cycle: the updated tile vectors flow back into
    // the PHIs from the latch of the K loop.
    for (unsigned I = 0; I < TileResult.getNumVectors(); I++)
      ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.KLoop.Latch);

    // Force unrolling of a few iterations of the inner loop, to make sure there
    // is enough work per iteration.
    // FIXME: The unroller should make this decision directly instead, but
    // currently the cost-model is not up to the task.
    unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize);
    addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header),
                            "llvm.loop.unroll.count", InnerLoopUnrollCount);
  }


  /// Lower the fused chain {\p LoadOp0, \p LoadOp1} -> \p MatMul -> \p Store
  /// tile by tile, reading the operands directly from memory and storing each
  /// result tile through the pointer operand of \p Store.
  ///
  /// Does nothing if isFusionProfitable rejects \p MatMul. Otherwise the
  /// operand pointers are first made non-aliasing with the store (see
  /// getNonAliasingPointer), then either a tiled loop nest is created (when the
  /// estimated number of native vector ops exceeds TileLoopsThreshold and both
  /// result dimensions are multiples of TileSize) or the tiles are emitted
  /// fully unrolled before \p Store. Finally \p Store and \p MatMul, and the
  /// loads if they became unused, are added to \p FusedInsts and erased.
  ///
  /// Only the column-major layout is supported (asserted).
  void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
                      StoreInst *Store,
                      SmallPtrSetImpl<Instruction *> &FusedInsts) {
    assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
           "Tiling only supported for column-major matrixes at the moment!");
    if (!isFusionProfitable(MatMul))
      return;

    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));

    // The product of an R x M and an M x C matrix is an R x C matrix.
    const unsigned R = LShape.NumRows;
    const unsigned C = RShape.NumColumns;
    const unsigned M = LShape.NumColumns;
    auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();

    auto [APtr, AAlloca] = getNonAliasingPointer(LoadOp0, Store, MatMul);
    auto [BPtr, BAlloca] = getNonAliasingPointer(LoadOp1, Store, MatMul);
    Value *CPtr = Store->getPointerOperand();

    // Use loop-based tiling when the number of expected operations exceeds
    // threshold.
    unsigned NumOps = getNumNativeVectorOps(EltType, R, M, C);
    bool UseLoops =
        (NumOps > TileLoopsThreshold) && R % TileSize == 0 && C % TileSize == 0;
    if (UseLoops)
      createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store);
    else {
      // Fully unrolled tiling: for each result tile, accumulate the products
      // of the matching operand tiles along the shared dimension.
      IRBuilder<> Builder(Store);
      for (unsigned J = 0; J < C; J += TileSize)
        for (unsigned I = 0; I < R; I += TileSize) {
          // Tiles at the edges may be smaller than TileSize.
          const unsigned TileR = std::min(R - I, unsigned(TileSize));
          const unsigned TileC = std::min(C - J, unsigned(TileSize));
          MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);

          for (unsigned K = 0; K < M; K += TileSize) {
            const unsigned TileM = std::min(M - K, unsigned(TileSize));
            MatrixTy A =
                loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
                           LShape, getIndex(APtr, I), getIndex(APtr, K),
                           {TileR, TileM}, EltType, Builder);
            MatrixTy B =
                loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
                           RShape, getIndex(BPtr, K), getIndex(BPtr, J),
                           {TileM, TileC}, EltType, Builder);
            emitMatrixMultiply(Res, A, B, Builder, true, false,
                               getFastMathFlags(MatMul));
          }
          // The stored matrix is the R x C result, matching the shape that
          // createTiledLoops passes on the loop-based path.
          storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, C},
                      getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder);
        }
    }

    // End the lifetime of the allocas used for alias-safe copies.
    {
      IRBuilder<> Builder(Store);
      if (AAlloca)
        Builder.CreateLifetimeEnd(AAlloca);
      if (BAlloca)
        Builder.CreateLifetimeEnd(BAlloca);
    }

    // Mark eliminated instructions as fused and remove them.
    FusedInsts.insert(Store);
    FusedInsts.insert(MatMul);
    eraseFromParentAndRemoveFromShapeMap(Store);
    eraseFromParentAndRemoveFromShapeMap(MatMul);
    if (LoadOp0->use_empty()) {
      FusedInsts.insert(LoadOp0);
      eraseFromParentAndRemoveFromShapeMap(LoadOp0);
    }
    // Both operands may be the same load; do not erase it twice.
    if (LoadOp1 != LoadOp0 && LoadOp1->use_empty()) {
      FusedInsts.insert(LoadOp1);
      eraseFromParentAndRemoveFromShapeMap(LoadOp1);
    }
  }


  /// Try to lower matrix multiply chains by fusing operations.
  ///
  /// Call finalizeLowering on lowered instructions.  Instructions that are
  /// completely eliminated by fusion are added to \p FusedInsts.
  ///
  /// Two patterns are handled:
  ///  * a multiply whose scalar-providing operand (the second operand for
  ///    column-major, the first for row-major) is a matrix transpose: the
  ///    transpose is folded into the multiply;
  ///  * column-major only: a {load, load} -> multiply -> store chain where the
  ///    store is the only user of the multiply, which is handed to
  ///    emitSIMDTiling.
  /// Does nothing if FuseMatrix is off, DT is unavailable or TileSize is zero.
  /// \p LifetimeEnds holds lifetime.end calls that may have to be moved or
  /// removed; removed entries are dropped from the vector.
  void
  LowerMatrixMultiplyFused(CallInst *MatMul,
                           SmallPtrSetImpl<Instruction *> &FusedInsts,
                           SmallVector<IntrinsicInst *, 16> &LifetimeEnds) {
    if (!FuseMatrix || !DT || TileSize == 0)
      return;

    assert(AA && LI && "Analyses should be available");

    Value *A = MatMul->getArgOperand(0);
    Value *B = MatMul->getArgOperand(1);

    // We can fold the transpose into the operand that is used to fetch scalars.
    Value *T;
    if (MatrixLayout == MatrixLayoutTy::ColumnMajor
            ? match(B, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))
            : match(A, m_Intrinsic<Intrinsic::matrix_transpose>(m_Value(T)))) {
      IRBuilder<> Builder(MatMul);
      auto *EltType =
          cast<FixedVectorType>(MatMul->getType())->getElementType();
      ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
      ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
      const unsigned R = LShape.NumRows;
      const unsigned M = LShape.NumColumns;
      const unsigned C = RShape.NumColumns;

      MatrixTy MA;
      MatrixTy MB;

      // T is the input of the folded transpose; it is read with the
      // transposed shape, and Transpose remembers the transpose itself.
      Value *Transpose;
      if (MatrixLayout == MatrixLayoutTy::ColumnMajor) {
        MA = getMatrix(A, ShapeInfo(R, M), Builder);
        MB = getMatrix(T, ShapeInfo(C, M), Builder);
        Transpose = B;
      } else {
        // NOTE(review): here T feeds the transpose that produces A, yet the
        // shapes passed are (R, M) for T and (C, M) for B. Confirm these are
        // what emitMatrixMultiply expects for the row-major layout.
        MA = getMatrix(T, ShapeInfo(R, M), Builder);
        MB = getMatrix(B, ShapeInfo(C, M), Builder);
        Transpose = A;
      }

      // Initialize the output
      MatrixTy Result(R, C, EltType);

      emitMatrixMultiply(Result, MA, MB, Builder, false, true,
                         getFastMathFlags(MatMul));

      FusedInsts.insert(MatMul);
      // The transpose can only be removed if the multiply was its only user.
      if (Transpose->hasOneUse()) {
        FusedInsts.insert(cast<Instruction>(Transpose));
        ToRemove.push_back(cast<Instruction>(Transpose));
        // TODO: add a fake entry for the folded instruction so that this is
        // included in the expression in the remark.
        Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      }
      finalizeLowering(MatMul, Result, Builder);
      return;
    }

    if (!MatMul->hasOneUse() || MatrixLayout != MatrixLayoutTy::ColumnMajor)
      return;

    // Lower {ld, ld} -> matmul -> st chains.  No need to call finalizeLowering
    // since the single store user will be lowered as part of this.
    auto *LoadOp0 = dyn_cast<LoadInst>(A);
    auto *LoadOp1 = dyn_cast<LoadInst>(B);
    auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
    if (LoadOp0 && LoadOp1 && Store) {
      // The store address must dominate the MatMul instruction, otherwise
      // we create invalid IR.
      // Collect the instructions computing the store address that do not
      // dominate MatMul yet. Give up on PHIs and on instructions that may have
      // side effects or read memory.
      SetVector<Value *> WorkList;
      WorkList.insert(Store->getOperand(1));
      SmallVector<Instruction *> ToHoist;
      for (unsigned I = 0; I != WorkList.size(); ++I) {
        Value *Current = WorkList[I];
        auto *CurrI = dyn_cast<Instruction>(Current);
        if (!CurrI)
          continue;
        if (isa<PHINode>(CurrI))
          return;
        if (DT->dominates(CurrI, MatMul))
          continue;
        if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory())
          return;
        ToHoist.push_back(CurrI);
        WorkList.insert_range(CurrI->operands());
      }

      // Move the collected instructions before MatMul, ordered by dominance.
      sort(ToHoist, [this](Instruction *A, Instruction *B) {
        return DT->dominates(A, B);
      });
      for (Instruction *I : ToHoist)
        I->moveBefore(MatMul->getIterator());

      // Deal with lifetime.end calls that might be between Load0/Load1 and the
      // store. To avoid introducing loads to dead objects (i.e. after the
      // lifetime has been terminated by @llvm.lifetime.end), either sink them
      // after the store if in the same block, or remove the lifetime.end marker
      // otherwise. This might pessimize further optimizations, by extending the
      // lifetime of the object until the function returns, but should be
      // conservatively correct.
      MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0);
      MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1);
      BasicBlock *StoreParent = Store->getParent();
      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
                                   LoadOp1->getParent() == StoreParent;
      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
        IntrinsicInst *End = LifetimeEnds[Idx];
        // Advance to the next entry on every exit from this iteration, except
        // when the entry is removed below (Inc.release()).
        llvm::scope_exit Inc([&Idx]() { Idx++; });
        // If the lifetime.end is guaranteed to be before the loads or after the
        // store, it won't interfere with fusion.
        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
          continue;
        if (DT->dominates(Store, End))
          continue;
        // If all fusable ops are in the same block and the lifetime.end is in a
        // different block, it won't interfere with fusion.
        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
          continue;

        // If the loads don't alias the lifetime.end, it won't interfere with
        // fusion.
        MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 0, nullptr);
        if (!EndLoc.Ptr)
          continue;
        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
          continue;

        // If both lifetime.end and the store are in the same block, extend the
        // lifetime until after the store, so the new lifetime covers the loads
        // we introduce later.
        if (End->getParent() == StoreParent) {
          End->moveAfter(Store);
          continue;
        }

        // Otherwise remove the conflicting lifetime.end marker. The last entry
        // is swapped into this slot, so Idx must not advance.
        ToRemove.push_back(End);
        std::swap(LifetimeEnds[Idx], LifetimeEnds.back());
        LifetimeEnds.pop_back();
        Inc.release();
      }

      emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
      return;
    }
  }


  /// Lowers llvm.matrix.multiply.
  ///
  /// Operands 0 and 1 of \p MatMul are the matrices to multiply; operands 2-4
  /// provide their dimensions. Returns the product matrix built by
  /// emitMatrixMultiply (non-tiled, without a folded transpose), using the
  /// fast-math flags of \p MatMul.
  MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) {
    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
    auto *ResultEltTy =
        cast<FixedVectorType>(MatMul->getType())->getElementType();

    // Fetch the operands, left before right.
    const MatrixTy &LhsMatrix =
        getMatrix(MatMul->getArgOperand(0), LShape, Builder);
    const MatrixTy &RhsMatrix =
        getMatrix(MatMul->getArgOperand(1), RShape, Builder);
    assert(LhsMatrix.getElementType() == RhsMatrix.getElementType() &&
           "Matrix multiply argument element types do not match.");
    assert(LShape.NumColumns == RShape.NumRows);

    // The product has the rows of the left and the columns of the right
    // operand.
    MatrixTy Product(LShape.NumRows, RShape.NumColumns, ResultEltTy);
    assert(LhsMatrix.getElementType() == Product.getElementType() &&
           "Matrix multiply result element type does not match arguments.");

    emitMatrixMultiply(Product, LhsMatrix, RhsMatrix, Builder,
                       /*IsTiled=*/false, /*IsScalarMatrixTransposed=*/false,
                       getFastMathFlags(MatMul));
    return Product;
  }


  /// Lowers llvm.matrix.transpose.
  ///
  /// Operand 0 of \p Inst is the input matrix and operands 1 and 2 give its
  /// shape. Every result vector is assembled element by element with
  /// extractelement/insertelement pairs. The returned matrix records
  /// 2 * rows * columns compute ops and one exposed transpose.
  MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) {
    Value *Source = Inst->getArgOperand(0);
    FixedVectorType *SourceTy = cast<FixedVectorType>(Source->getType());
    ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
    MatrixTy SourceMatrix = getMatrix(Source, ArgShape, Builder);

    // Transposing swaps the number of vectors with the elements per vector.
    unsigned NumResultVecs;
    unsigned EltsPerResultVec;
    if (SourceMatrix.isColumnMajor()) {
      NumResultVecs = ArgShape.NumRows;
      EltsPerResultVec = ArgShape.NumColumns;
    } else {
      NumResultVecs = ArgShape.NumColumns;
      EltsPerResultVec = ArgShape.NumRows;
    }

    MatrixTy Result;
    for (unsigned VecIdx = 0; VecIdx != NumResultVecs; ++VecIdx) {
      // Start from poison and fill in one element per input vector.
      Value *Transposed = PoisonValue::get(
          FixedVectorType::get(SourceTy->getElementType(), EltsPerResultVec));
      for (auto Src : enumerate(SourceMatrix.vectors())) {
        // Element VecIdx of input vector N becomes element N of result vector
        // VecIdx.
        Value *Picked = Builder.CreateExtractElement(Src.value(), VecIdx);
        Transposed =
            Builder.CreateInsertElement(Transposed, Picked, Src.index());
      }
      Result.addVector(Transposed);
    }

    // TODO: Improve estimate of operations needed for transposes. Currently we
    // just count the insertelement/extractelement instructions, but do not
    // account for later simplifications/combines.
    return Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns)
        .addNumExposedTransposes(1);
  }


  /// Lower the load \p Inst of a matrix with shape \p SI from \p Ptr.
  ///
  /// Forwards to LowerLoad() with the load's own alignment and volatility and
  /// with the stride value obtained from getIndex(Ptr, SI.getStride()).
  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
                     IRBuilder<> &Builder) {
    auto *Stride = getIndex(Ptr, SI.getStride());
    return LowerLoad(Inst, Ptr, Inst->getAlign(), Stride, Inst->isVolatile(),
                     SI, Builder);
  }


  /// Lower the store \p Inst of matrix \p StoredVal with shape \p SI to \p Ptr.
  ///
  /// Forwards to LowerStore() with the store's own alignment and volatility
  /// and with the stride value obtained from getIndex(Ptr, SI.getStride()).
  MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
                      Value *Ptr, IRBuilder<> &Builder) {
    auto *Stride = getIndex(Ptr, SI.getStride());
    return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(), Stride,
                      Inst->isVolatile(), SI, Builder);
  }


  /// Lower the PHI node \p Inst of a matrix with shape \p SI.
  ///
  /// The vectors of the matrix obtained for \p Inst itself are treated as PHI
  /// nodes (enforced by cast<PHINode>); each of them receives the matching
  /// vector of every incoming value. On return the builder's insertion point
  /// is the first insertion point of the PHI's block.
  MatrixTy VisitPHI(PHINode *Inst, const ShapeInfo &SI, IRBuilder<> &Builder) {
    auto BlockIP = Inst->getParent()->getFirstInsertionPt();
    Builder.SetInsertPoint(BlockIP);
    MatrixTy PhiM = getMatrix(Inst, SI, Builder);
    const unsigned NumVectors = PhiM.getNumVectors();

    for (unsigned Idx = 0, NumIncoming = Inst->getNumIncomingValues();
         Idx != NumIncoming; ++Idx) {
      Value *IncomingV = Inst->getIncomingValue(Idx);
      BasicBlock *IncomingB = Inst->getIncomingBlock(Idx);

      // getMatrix() may insert some instructions to help with reshaping. The
      // safest place for those is at the top of the block after the rest of the
      // PHI's. Even better, if we can put it in the incoming block.
      Builder.SetInsertPoint(BlockIP);
      if (auto *IncomingInst = dyn_cast<Instruction>(IncomingV)) {
        auto MaybeIP = IncomingInst->getInsertionPointAfterDef();
        if (MaybeIP)
          Builder.SetInsertPoint(*MaybeIP);
      }

      MatrixTy OpM = getMatrix(IncomingV, SI, Builder);

      // Wire vector VI of the incoming matrix into PHI vector VI.
      for (unsigned VI = 0; VI != NumVectors; ++VI)
        cast<PHINode>(PhiM.getVector(VI))
            ->addIncoming(OpM.getVector(VI), IncomingB);
    }

    // finalizeLowering() may also insert instructions in some cases. The safe
    // place for those is at the end of the initial block of PHIs.
    Builder.SetInsertPoint(BlockIP);
    return PhiM;
  }


  /// Lower the binary operator \p Inst on matrixes of shape \p SI.
  ///
  /// Applies the operator's opcode pairwise to the vectors of both lowered
  /// operands, using the fast-math flags of \p Inst. The recorded compute-op
  /// count is getNumOps() of the result vector type times the number of result
  /// vectors.
  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
                               IRBuilder<> &Builder) {
    MatrixTy Result;
    MatrixTy LhsM = getMatrix(Inst->getOperand(0), SI, Builder);
    MatrixTy RhsM = getMatrix(Inst->getOperand(1), SI, Builder);
    assert(LhsM.isColumnMajor() == RhsM.isColumnMajor() &&
           Result.isColumnMajor() == LhsM.isColumnMajor() &&
           "operands must agree on matrix layout");

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    const auto Opcode = Inst->getOpcode();
    for (auto [LhsVec, RhsVec] :
         llvm::zip_equal(LhsM.vectors(), RhsM.vectors())) {
      Value *Combined = Builder.CreateBinOp(Opcode, LhsVec, RhsVec);
      Result.addVector(Combined);
    }

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }


  /// Lower the unary operator \p Inst on a matrix of shape \p SI.
  ///
  /// Only FNeg is handled; any other opcode reaches llvm_unreachable as soon
  /// as a vector has to be processed. The operator is applied to every vector
  /// of the lowered operand, using the fast-math flags of \p Inst.
  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI,
                              IRBuilder<> &Builder) {
    MatrixTy Result;
    MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    for (Value *Vector : M.vectors()) {
      Value *Lowered = nullptr;
      switch (Inst->getOpcode()) {
      case Instruction::FNeg:
        Lowered = Builder.CreateFNeg(Vector);
        break;
      default:
        llvm_unreachable("Unsupported unary operator for matrix");
      }
      Result.addVector(Lowered);
    }

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }


  /// Lower the cast \p Inst on a matrix of shape \p Shape.
  ///
  /// Every vector of the lowered operand is cast, with the opcode of \p Inst,
  /// to a fixed vector that has the element type of the cast's result and
  /// M.getStride() elements.
  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape,
                                IRBuilder<> &Builder) {
    MatrixTy Result;
    MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);

    Builder.setFastMathFlags(getFastMathFlags(Inst));

    Type *DestEltTy = cast<VectorType>(Inst->getType())->getElementType();
    auto *DestVecTy =
        VectorType::get(DestEltTy, ElementCount::getFixed(M.getStride()));

    const auto Opcode = Inst->getOpcode();
    for (Value *Vector : M.vectors()) {
      Value *Converted = Builder.CreateCast(Opcode, Vector, DestVecTy);
      Result.addVector(Converted);
    }

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }


  /// Lower selects.
  ///
  /// Emits one select per vector of the lowered true/false operands. A
  /// fixed-vector condition is lowered to a matrix as well and contributes one
  /// condition vector per select; any other condition is reused unchanged for
  /// every emitted select.
  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape,
                           IRBuilder<> &Builder) {
    Value *Cond = Inst->getOperand(0);
    Value *OpA = Inst->getOperand(1);
    Value *OpB = Inst->getOperand(2);

    MatrixTy Result;
    MatrixTy A = getMatrix(OpA, Shape, Builder);
    MatrixTy B = getMatrix(OpB, Shape, Builder);

    SmallVector<Value*> CondV;
    // Instruction passed to CreateSelect() as its metadata source. It is only
    // set when the condition is not a fixed vector, so that the original
    // select's metadata is never attached to the per-vector selects of a
    // vector condition.
    Instruction *MDFrom = nullptr;
    if (isa<FixedVectorType>(Cond->getType())) {
      MatrixTy C = getMatrix(Cond, Shape, Builder);
      llvm::copy(C.vectors(), std::back_inserter(CondV));
    } else {
      CondV.resize(A.getNumVectors());
      llvm::fill(CondV, Cond);
      if (!ProfcheckDisableMetadataFixes)
        MDFrom = Inst;
    }

    for (auto [CV, AV, BV] : llvm::zip_equal(CondV, A.vectors(), B.vectors())) {
      // The condition below fires when a vector condition is combined with a
      // metadata source, so the message has to state that this must NOT
      // happen.
      assert(!(isa<VectorType>(CV->getType()) && static_cast<bool>(MDFrom)) &&
             "If we have a vector conditional, we should not be propagating "
             "profile information.");
      Result.addVector(Builder.CreateSelect(CV, AV, BV, "", MDFrom));
    }

    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                   Result.getNumVectors());
  }


  /// Helper to linearize a matrix expression tree into a string. Currently

  /// matrix expressions are linarized by starting at an expression leaf and

  /// linearizing bottom up.

  struct ExprLinearizer {

    unsigned LengthToBreak = 100;

    std::string Str;

    raw_string_ostream Stream;

    unsigned LineLength = 0;

    const DataLayout &DL;


    /// Mapping from instructions to matrixes. It is used to identify

    /// matrix instructions.

    const MapVector<Value *, MatrixTy> &Inst2Matrix;


    /// Mapping from values to the leaves of all expressions that the value is

    /// part of.

    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;


    /// Set of matrix expressions in the scope of a given DISubprogram.

    const SmallSetVector<Value *, 32> &ExprsInSubprogram;


    /// Leaf node of the expression to linearize.

    Value *Leaf;


    /// Used to keep track of sub-expressions that get reused while linearizing

    /// the expression. Re-used sub-expressions are marked as (reused).

    SmallPtrSet<Value *, 8> ReusedExprs;


    ExprLinearizer(const DataLayout &DL,

                   const MapVector<Value *, MatrixTy> &Inst2Matrix,

                   const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,

                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,

                   Value *Leaf)

        : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),

          ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}


    void indent(unsigned N) {

      LineLength += N;

      for (unsigned i = 0; i < N; i++)

        Stream << " ";

    }


    void lineBreak() {

      Stream << "\n";

      LineLength = 0;

    }


    void maybeIndent(unsigned Indent) {

      if (LineLength >= LengthToBreak)

        lineBreak();


      if (LineLength == 0)

        indent(Indent);

    }


    void write(StringRef S) {

      LineLength += S.size();

      Stream << S;

    }


    Value *getUnderlyingObjectThroughLoads(Value *V) {

      if (Value *Ptr = getPointerOperand(V))

        return getUnderlyingObjectThroughLoads(Ptr);

      else if (V->getType()->isPointerTy())

        return getUnderlyingObject(V);

      return V;

    }


    /// Returns true if \p V is a matrix value in the given subprogram.

    bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }


    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to

    /// \p SS.

    void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {

      auto M = Inst2Matrix.find(V);

      if (M == Inst2Matrix.end())

        SS << "unknown";

      else {

        SS << M->second.getNumRows();

        SS << "x";

        SS << M->second.getNumColumns();

      }

    }


    /// Write the called function name. Handles calls to llvm.matrix.*

    /// specially: we write the name, followed by the dimensions of the input

    /// matrixes, followed by the scalar type name.

    void writeFnName(CallInst *CI) {

      if (!CI->getCalledFunction())

        write("<no called fn>");

      else {

        StringRef Name = CI->getCalledFunction()->getName();

        if (!Name.starts_with("llvm.matrix")) {

          write(Name);

          return;

        }

        auto *II = cast<IntrinsicInst>(CI);

        write(Intrinsic::getBaseName(II->getIntrinsicID())

                  .drop_front(StringRef("llvm.matrix.").size()));

        write(".");

        std::string Tmp;

        raw_string_ostream SS(Tmp);


        switch (II->getIntrinsicID()) {

        case Intrinsic::matrix_multiply:

          prettyPrintMatrixType(II->getOperand(0), SS);

          SS << ".";

          prettyPrintMatrixType(II->getOperand(1), SS);

          SS << "." << *II->getType()->getScalarType();

          break;

        case Intrinsic::matrix_transpose:

          prettyPrintMatrixType(II->getOperand(0), SS);

          SS << "." << *II->getType()->getScalarType();

          break;

        case Intrinsic::matrix_column_major_load:

          prettyPrintMatrixType(II, SS);

          SS << "." << *II->getType()->getScalarType();

          break;

        case Intrinsic::matrix_column_major_store:

          prettyPrintMatrixType(II->getOperand(0), SS);

          SS << "." << *II->getOperand(0)->getType()->getScalarType();

          break;

        default:

          llvm_unreachable("Unhandled case");

        }

        write(Tmp);

      }

    }


    unsigned getNumShapeArgs(CallInst *CI) const {

      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {

        switch (II->getIntrinsicID()) {

        case Intrinsic::matrix_multiply:

          return 3;

        case Intrinsic::matrix_transpose:

          return 2;

        case Intrinsic::matrix_column_major_load:

        case Intrinsic::matrix_column_major_store:

          return 3;

        default:

          return 0;

        }

      }

      return 0;

    }


    /// Special printing for values: for pointers, we print if they refer to an

    /// (function) external address or a stack address, for other values we

    /// either print the constant or "scalar"/"matrix" for other values.

    void write(Value *V) {

      V = getUnderlyingObjectThroughLoads(V);

      if (V->getType()->isPointerTy()) {

        if (isa<AllocaInst>(V)) {

          Stream << "stack addr";

          LineLength += StringRef("stack addr").size();

        } else {

          Stream << "addr";

          LineLength += StringRef("addr").size();

        }

        if (!V->getName().empty()) {

          Stream << " %" << V->getName() << "";

          LineLength += V->getName().size() + 2;

        }

        return;

      }


      std::string Tmp;

      raw_string_ostream TmpStream(Tmp);


      if (auto *CI = dyn_cast<ConstantInt>(V))

        TmpStream << CI->getValue();

      else if (isa<Constant>(V))

        TmpStream << "constant";

      else {

        if (isMatrix(V))

          TmpStream << "matrix";

        else

          TmpStream << "scalar";

      }

      Tmp = std::string(StringRef(Tmp).trim());

      LineLength += Tmp.size();

      Stream << Tmp;

    }


    /// Linearize expression \p Expr starting at an indentation of \p Indent.

    /// Expressions that are re-used multiple times are prefixed with (reused)

    /// at the re-used root instruction.

    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,

                       bool ParentShared) {

      auto *I = cast<Instruction>(Expr);

      maybeIndent(Indent);

      SmallVector<Value *, 8> Ops;


      // Is Expr shared with other expression leaves?

      bool ExprShared = false;


      // Deal with shared subtrees. Mark them as shared, if required.

      if (!ParentShared) {

        auto SI = Shared.find(Expr);

        assert(SI != Shared.end() && SI->second.count(Leaf));


        for (Value *S : SI->second) {

          if (S == Leaf)

            continue;

          DebugLoc DL = cast<Instruction>(S)->getDebugLoc();

          write("shared with remark at line " + std::to_string(DL.getLine()) +

                " column " + std::to_string(DL.getCol()) + " (");

        }

        ExprShared = SI->second.size() > 1;

      }


      bool Reused = !ReusedExprs.insert(Expr).second;

      if (Reused && !ParentReused)

        write("(reused) ");


      if (auto *CI = dyn_cast<CallInst>(I)) {

        writeFnName(CI);


        Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));

      } else if (isa<BitCastInst>(Expr)) {

        // Special case bitcasts, which are used to materialize matrixes from

        // non-matrix ops.

        write("matrix");

        return;

      } else {

        Ops.append(I->value_op_begin(), I->value_op_end());

        write(I->getOpcodeName());

      }


      write("(");


      unsigned NumOpsToBreak = 1;

      if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))

        NumOpsToBreak = 2;


      for (Value *Op : Ops) {

        if (Ops.size() > NumOpsToBreak)

          lineBreak();


        maybeIndent(Indent + 1);

        if (isMatrix(Op))

          linearizeExpr(Op, Indent + 1, Reused, ExprShared);

        else

          write(Op);

        if (Op != Ops.back())

          write(", ");

      }


      write(")");

    }


    const std::string &getResult() {

      return Str;

    }

  };


  /// Generate remarks for matrix operations in a function. To generate remarks

  /// for matrix expressions, the following approach is used:

  /// 1. Use the inlined-at debug information to group matrix operations to the

  ///    DISubprograms they are contained in.

  /// 2. Collect leaves of matrix expressions (done in

  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression

  //     mapping.  Leaves are lowered matrix instructions without other matrix

  //     users (like stores) in the current subprogram.

  /// 3. For each leaf, create a remark containing a linearizied version of the

  ///    matrix expression. The expression is linearized by a recursive

  ///    bottom-up traversal of the matrix operands, starting at a leaf. Note

  ///    that multiple leaves can share sub-expressions. Shared subexpressions

  ///    are explicitly marked as shared().

  struct RemarkGenerator {

    const MapVector<Value *, MatrixTy> &Inst2Matrix;

    OptimizationRemarkEmitter &ORE;

    Function &Func;

    const DataLayout &DL;


    RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,

                    OptimizationRemarkEmitter &ORE, Function &Func)

        : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),

          DL(Func.getDataLayout()) {}


    /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are

    /// instructions in Inst2Matrix returning void or without any users in

    /// \p ExprsInSubprogram. Currently that should only include stores.

    SmallVector<Value *, 4>

    getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {

      SmallVector<Value *, 4> Leaves;

      for (auto *Expr : ExprsInSubprogram)

        if (Expr->getType()->isVoidTy() ||

            !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {

              return ExprsInSubprogram.count(U);

            }))

          Leaves.push_back(Expr);

      return Leaves;

    }


    /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf

    /// to all visited expressions in \p Shared. Limit the matrix operations to

    /// the ones in \p ExprsInSubprogram.

    void collectSharedInfo(Value *Leaf, Value *V,

                           const SmallSetVector<Value *, 32> &ExprsInSubprogram,

                           DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {


      if (!ExprsInSubprogram.count(V))

        return;


      Shared[V].insert(Leaf);


      for (Value *Op : cast<Instruction>(V)->operand_values())

        collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);

    }


    /// Calculate the number of exclusive and shared op counts for expression

    /// starting at \p V. Expressions used multiple times are counted once.

    /// Limit the matrix operations to the ones in \p ExprsInSubprogram.

    std::pair<OpInfoTy, OpInfoTy>

    sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,

               const SmallSetVector<Value *, 32> &ExprsInSubprogram,

               DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {

      if (!ExprsInSubprogram.count(Root))

        return {};


      // Already counted this expression. Stop.

      if (!ReusedExprs.insert(Root).second)

        return {};


      OpInfoTy SharedCount;

      OpInfoTy Count;


      auto I = Shared.find(Root);

      auto CM = Inst2Matrix.find(Root);

      if (I->second.size() == 1)

        Count = CM->second.getOpInfo();

      else

        SharedCount = CM->second.getOpInfo();


      for (Value *Op : cast<Instruction>(Root)->operand_values()) {

        auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);

        Count += C.first;

        SharedCount += C.second;

      }

      return {Count, SharedCount};

    }


    void emitRemarks() {

      if (!ORE.allowExtraAnalysis(DEBUG_TYPE))

        return;


      // Map matrix operations to their containting subprograms, by traversing

      // the inlinedAt chain. If the function does not have a DISubprogram, we

      // only map them to the containing function.

      MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;

      for (const auto &KV : Inst2Matrix) {

        if (Func.getSubprogram()) {

          auto *I = cast<Instruction>(KV.first);

          DILocation *Context = I->getDebugLoc();

          while (Context) {

            Subprog2Exprs[getSubprogram(Context->getScope())].push_back(

                KV.first);

            Context = DebugLoc(Context).getInlinedAt();

          }

        } else {

          Subprog2Exprs[nullptr].push_back(KV.first);

        }

      }

      for (auto &KV : Subprog2Exprs) {

        SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),

                                                      KV.second.end());

        auto Leaves = getExpressionLeaves(ExprsInSubprogram);


        DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;

        for (Value *Leaf : Leaves)

          collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);


        // Generate remarks for each leaf.

        for (auto *L : Leaves) {


          DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();

          DILocation *Context = cast<Instruction>(L)->getDebugLoc();

          while (Context) {

            if (getSubprogram(Context->getScope()) == KV.first) {

              Loc = Context;

              break;

            }

            Context = DebugLoc(Context).getInlinedAt();

          }


          SmallPtrSet<Value *, 8> ReusedExprs;

          OpInfoTy Counts, SharedCounts;

          std::tie(Counts, SharedCounts) =

              sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);


          OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,

                                 cast<Instruction>(L)->getParent());


          Rem << "Lowered with ";

          Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "

              << ore::NV("NumLoads", Counts.NumLoads) << " loads, "

              << ore::NV("NumComputeOps", Counts.NumComputeOps)

              << " compute ops, "

              << ore::NV("NumExposedTransposes", Counts.NumExposedTransposes)

              << " exposed transposes";


          if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||

              SharedCounts.NumComputeOps > 0) {

            Rem << ",\nadditionally "

                << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "

                << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "

                << ore::NV("NumFPOps", SharedCounts.NumComputeOps)

                << " compute ops"

                << " are shared with other expressions";

          }


          Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));

          ORE.emit(Rem);

        }

      }

    }


    std::string

    linearize(Value *L,

              const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,

              const SmallSetVector<Value *, 32> &ExprsInSubprogram,

              const DataLayout &DL) {

      ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);

      Lin.linearizeExpr(L, 0, false, false);

      return Lin.getResult();

    }

  };

};

} // namespace


/// Run the matrix lowering on \p F.
///
/// In minimal mode the lowering is constructed without an analysis manager.
/// Returns PreservedAnalyses::all() when Visit() reports no change; otherwise
/// only LoopAnalysis and DominatorTreeAnalysis are marked preserved, and only
/// when not in minimal mode.
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);

  LowerMatrixIntrinsics LMT(F, TTI, Minimal ? nullptr : &AM);
  if (!LMT.Visit())
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  if (!Minimal) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  return PA;
}


/// Print the pass as it appears in a pipeline description: the text written by
/// PassInfoMixin::printPipeline, followed by "<minimal>" in minimal mode and
/// by "<>" otherwise.
void LowerMatrixIntrinsicsPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  using MixinTy = PassInfoMixin<LowerMatrixIntrinsicsPass>;
  static_cast<MixinTy *>(this)->printPipeline(OS, MapClassName2PassName);

  OS << '<' << (Minimal ? "minimal" : "") << '>';
}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

Select
AMDGPU Register Bank Select
Definition AMDGPURegBankSelect.cpp:68

PHI
Rewrite undef for PHI
Definition AMDGPURewriteUndefForPHI.cpp:98

AliasAnalysis.h

Alignment.h

getParent
static const Function * getParent(const Value *V)
Definition BasicAliasAnalysis.cpp:894

BasicBlockUtils.h

BT
BitTracker BT
Definition BitTracker.cpp:68

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

CommandLine.h

clEnumValN
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition CommandLine.h:687

Compiler.h

LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661

IntrinsicCostStrategy::InstructionCost
@ InstructionCost
Definition CostModel.cpp:51

DataLayout.h

DebugInfoMetadata.h

DerivedTypes.h

DomTreeUpdater.h

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

getIndexType
static Type * getIndexType(Value *In)
Definition HexagonVectorCombine.cpp:2085

vectors
hexagon Hexagon specific predictive commoning for HVX vectors
Definition HexagonVectorLoopCarriedReuse.cpp:209

IRBuilder.h

CFG.h
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...

Function.h

IntrinsicInst.h

users
iv users
Definition IVUsers.cpp:48

InstrTypes.h

Instructions.h

getOpcode
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
Definition Instrumentor.cpp:1003

NumOps
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Definition ItaniumDemangle.h:3473

TemplateParamKind::Type
@ Type
Definition ItaniumDemangle.h:1243

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3391

isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539

Matrix
Live Register Matrix
Definition LiveRegMatrix.cpp:46

LoopInfo.h

LoopUtils.h

getSubprogram
static DISubprogram * getSubprogram(DIScope *Scope)
Helper function to either return Scope, if it is a subprogram or the attached subprogram for a local ...
Definition LowerMatrixIntrinsics.cpp:114

ForceFusion
static cl::opt< bool > ForceFusion("force-fuse-matrix", cl::init(false), cl::Hidden, cl::desc("Force matrix instruction fusion even if not profitable."))

m_AnyAdd
static auto m_AnyAdd(const LTy &L, const RTy &R)
Match any add operation (fp or integer).
Definition LowerMatrixIntrinsics.cpp:136

VerifyShapeInfo
static cl::opt< bool > VerifyShapeInfo("verify-matrix-shapes", cl::Hidden, cl::desc("Enable/disable matrix shape verification."), cl::init(false))

isShapePreserving
static bool isShapePreserving(Value *V)
Definition LowerMatrixIntrinsics.cpp:254

TileLoopsThreshold
static cl::opt< unsigned > TileLoopsThreshold("fuse-matrix-loops-threshold", cl::init(200), cl::Hidden, cl::desc("Generate loop nests for tiling when expected " "number of operations exceeds threshold."))

m_AnyMul
static auto m_AnyMul(const LTy &L, const RTy &R)
Match any mul operation (fp or integer).
Definition LowerMatrixIntrinsics.cpp:130

SplitMatmulRemainderOverThreshold
static cl::opt< unsigned > SplitMatmulRemainderOverThreshold("matrix-split-matmul-remainder-over-threshold", cl::Hidden, cl::desc("Illegal remainder vectors over this size in bits should be split " "in the inner loop of matmul"), cl::init(0))

isSplat
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
Definition LowerMatrixIntrinsics.cpp:122

FuseMatrix
static cl::opt< bool > FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden, cl::desc("Enable/disable fusing matrix instructions."))

AllowContractEnabled
static cl::opt< bool > AllowContractEnabled("matrix-allow-contract", cl::init(false), cl::Hidden, cl::desc("Allow the use of FMAs if available and profitable. This may " "result in different results, due to less rounding error."))

computeShapeInfoForInst
static std::optional< ShapeInfo > computeShapeInfoForInst(Instruction *I, const DenseMap< Value *, ShapeInfo > &ShapeMap)
Return the ShapeInfo for the result of I, if it can be determined.
Definition LowerMatrixIntrinsics.cpp:324

MatrixLayoutTy
MatrixLayoutTy
Definition LowerMatrixIntrinsics.cpp:89

MatrixLayoutTy::RowMajor
@ RowMajor
Definition LowerMatrixIntrinsics.cpp:89

MatrixLayoutTy::ColumnMajor
@ ColumnMajor
Definition LowerMatrixIntrinsics.cpp:89

PrintAfterTransposeOpt
static cl::opt< bool > PrintAfterTransposeOpt("matrix-print-after-transpose-opt", cl::init(false))

DEBUG_TYPE
#define DEBUG_TYPE
Definition LowerMatrixIntrinsics.cpp:58

getShapedOperandsForInst
static iterator_range< Use * > getShapedOperandsForInst(Instruction *I)
Return an iterator over the operands of I that should share shape information with I.
Definition LowerMatrixIntrinsics.cpp:314

computeVectorAddr
static Value * computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, unsigned NumElements, Type *EltType, IRBuilder<> &Builder)
Definition LowerMatrixIntrinsics.cpp:180

TileSize
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))

MatrixLayout
static cl::opt< MatrixLayoutTy > MatrixLayout("matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor), cl::desc("Sets the default matrix layout"), cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major", "Use column-major layout"), clEnumValN(MatrixLayoutTy::RowMajor, "row-major", "Use row-major layout")))

LowerMatrixIntrinsics.h

F
#define F(x, y, z)
Definition MD5.cpp:54

I
#define I(x, y, z)
Definition MD5.cpp:57

MatrixBuilder.h

MatrixUtils.h

Context
@ Context
Definition MemProfContextDisambiguation.cpp:135

T
#define T
Definition Mips16ISelLowering.cpp:282

T1
#define T1
Definition Mips16ISelLowering.cpp:281

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

OptimizationRemarkEmitter.h

Operation
PowerPC Reduce CR logical Operation
Definition PPCReduceCRLogicals.cpp:736

PatternMatch.h

PostOrderIterator.h
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.

ProfDataUtils.h
This file contains the declarations for profiling metadata utility functions.

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition RISCVRedundantCopyElimination.cpp:73

extractVector
static Value * extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name)
Definition SROA.cpp:2516

insertVector
static Value * insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name)
Definition SROA.cpp:2538

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

ScopeExit.h
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...

SmallVector.h
This file defines the SmallVector class.

Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171

Debug.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:119

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39

BlockSize
static const int BlockSize
Definition TarWriter.cpp:33

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

ValueTracking.h

VectorUtils.h

LowerStore
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition X86ISelLowering.cpp:26379

LowerLoad
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Definition X86ISelLowering.cpp:26464

RHS
Value * RHS
Definition X86PartialReduction.cpp:81

LHS
Value * LHS
Definition X86PartialReduction.cpp:80

llvm::AllocaInst::getAlign
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition Instructions.h:129

llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition PassManager.h:434

llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461

llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213

llvm::BasicBlock::rbegin
reverse_iterator rbegin()
Definition BasicBlock.h:477

llvm::BasicBlock::reverse_iterator
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172

llvm::BasicBlock::rend
reverse_iterator rend()
Definition BasicBlock.h:479

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237

llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition InstrTypes.h:409

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition InstrTypes.h:1417

llvm::CallBase::arg_begin
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition InstrTypes.h:1336

llvm::CallBase::getParamAlign
MaybeAlign getParamAlign(unsigned ArgNo) const
Extract the alignment for a call or parameter (0=unknown).
Definition InstrTypes.h:1847

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition InstrTypes.h:1361

llvm::CallBase::arg_end
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition InstrTypes.h:1342

llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:674

llvm::ConstantAggregateZero::get
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
Definition Constants.cpp:1814

llvm::DILocalScope::getSubprogram
LLVM_ABI DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Definition DebugInfoMetadata.cpp:1331

llvm::DIScope
Base class for scope-like contexts.
Definition DebugInfoMetadata.h:527

llvm::DISubprogram
Subprogram description. Uses SubclassData1.
Definition DebugInfoMetadata.h:2285

llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178

llvm::DenseMapBase::end
iterator end()
Definition DenseMap.h:81

llvm::DenseMap
Definition DenseMap.h:772

llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278

llvm::ElementCount::getFixed
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309

llvm::FastMathFlags::setAllowContract
void setAllowContract(bool B=true)
Definition FMF.h:93

llvm::FastMathFlags::allowReassoc
bool allowReassoc() const
Flag queries.
Definition FMF.h:67

llvm::FastMathFlags::allowContract
bool allowContract() const
Definition FMF.h:72

llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition DerivedTypes.h:685

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873

llvm::Function
Definition Function.h:65

llvm::Function::getIntrinsicID
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:246

llvm::Function::isIntrinsic
bool isIntrinsic() const
isIntrinsic - Returns true if the function's name starts with "llvm.".
Definition Function.h:251

llvm::GEPNoWrapFlags::inBounds
static GEPNoWrapFlags inBounds()
Definition GEPNoWrapFlags.h:50

llvm::GEPNoWrapFlags::noUnsignedWrap
static GEPNoWrapFlags noUnsignedWrap()
Definition GEPNoWrapFlags.h:56

llvm::IRBuilderBase::CreateFAddReduce
LLVM_ABI CallInst * CreateFAddReduce(Value *Acc, Value *Src)
Create a sequential vector fadd reduction intrinsic of the source vector.
Definition IRBuilder.cpp:442

llvm::IRBuilderBase::CreateICmpULT
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2390

llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627

llvm::IRBuilderBase::CreateLifetimeStart
LLVM_ABI CallInst * CreateLifetimeStart(Value *Ptr)
Create a lifetime.start intrinsic.
Definition IRBuilder.cpp:500

llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615

llvm::IRBuilderBase::CreateLifetimeEnd
LLVM_ABI CallInst * CreateLifetimeEnd(Value *Ptr)
Create a lifetime.end intrinsic.
Definition IRBuilder.cpp:506

llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1935

llvm::IRBuilderBase::CreateZExtOrTrunc
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2138

llvm::IRBuilderBase::CreateMemCpy
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memcpy between the specified pointers.
Definition IRBuilder.h:717

llvm::IRBuilderBase::CreateCondBr
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238

llvm::IRBuilderBase::CreateFAdd
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1658

llvm::IRBuilderBase::CreateVectorSplat
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition IRBuilder.cpp:1240

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Definition IRBuilder.cpp:931

llvm::IRBuilderBase::CreateSelect
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.cpp:1107

llvm::IRBuilderBase::CreateAddReduce
LLVM_ABI CallInst * CreateAddReduce(Value *Src)
Create a vector int add reduction intrinsic of the source vector.
Definition IRBuilder.cpp:452

llvm::IRBuilderBase::CreatePtrAdd
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2091

llvm::IRBuilderBase::CreateCast
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2276

llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352

llvm::IRBuilderBase::CreateInBoundsGEP
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2018

llvm::IRBuilderBase::CreateBinaryIntrinsic
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition IRBuilder.cpp:920

llvm::IRBuilderBase::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2539

llvm::IRBuilderBase::getIntN
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended from a 64-bit value.
Definition IRBuilder.h:539

llvm::IRBuilderBase::CreateUnaryIntrinsic
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition IRBuilder.cpp:912

llvm::IRBuilderBase::CreateLoad
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1918

llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2649

llvm::IRBuilderBase::CreateAdd
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444

llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753

llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207

llvm::IRBuilderBase::CreateAlignedStore
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1954

llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696

llvm::IRBuilderBase::CreateFNeg
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1851

llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858

llvm::Instruction
Definition Instruction.h:70

llvm::Instruction::moveAfter
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition Instruction.cpp:204

llvm::Instruction::setFastMathFlags
LLVM_ABI void setFastMathFlags(FastMathFlags FMF)
Convenience function for setting multiple fast-math flags on this instruction, which must be an opera...
Definition Instruction.cpp:664

llvm::Instruction::eraseFromParent
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition Instruction.cpp:112

llvm::Instruction::getFastMathFlags
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
Definition Instruction.cpp:714

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition IntrinsicInst.h:56

llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition Instructions.h:210

llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition Instructions.h:216

llvm::LocationSize::getValue
TypeSize getValue() const
Definition MemoryLocation.h:158

llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587

llvm::LowerMatrixIntrinsicsPass::run
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition LowerMatrixIntrinsics.cpp:2951

llvm::LowerMatrixIntrinsicsPass::printPipeline
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Definition LowerMatrixIntrinsics.cpp:2967

llvm::MatrixBuilder::CreateMatrixTranspose
CallInst * CreateMatrixTranspose(Value *Matrix, unsigned Rows, unsigned Columns, const Twine &Name="")
Create a llvm.matrix.transpose call, transposing Matrix with Rows rows and Columns columns.
Definition MatrixBuilder.h:110

llvm::MatrixBuilder::CreateMatrixMultiply
CallInst * CreateMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows, unsigned LHSColumns, unsigned RHSColumns, const Twine &Name="")
Create a llvm.matrix.multiply call, multiplying matrixes LHS and RHS.
Definition MatrixBuilder.h:126

llvm::MemoryLocation::get
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition MemoryLocation.cpp:36

llvm::MemoryLocation::Size
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
Definition MemoryLocation.h:234

llvm::MemoryLocation::Ptr
const Value * Ptr
The address of the start of the location.
Definition MemoryLocation.h:225

llvm::MemoryLocation::getForArgument
static LLVM_ABI MemoryLocation getForArgument(const CallBase *Call, unsigned ArgIdx, const TargetLibraryInfo *TLI)
Return a location representing a particular argument of a call.
Definition MemoryLocation.cpp:181

llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition Instructions.h:2795

llvm::PHINode::blocks
iterator_range< const_block_iterator > blocks() const
Definition Instructions.h:2721

llvm::PHINode::incoming_values
op_range incoming_values()
Definition Instructions.h:2725

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition Constants.cpp:2038

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PreservedAnalyses::preserve
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132

llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103

llvm::SetVector::insert_range
void insert_range(Range &&R)
Definition SetVector.h:176

llvm::SetVector::count
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition SmallPtrSet.h:98

llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition SmallPtrSet.h:402

llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition SmallPtrSet.h:461

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:387

llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition SmallPtrSet.h:467

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition SmallVector.h:681

llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition SmallVector.h:691

llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition SmallVector.h:646

llvm::SmallVectorTemplateBase::pop_back
void pop_back()
Definition SmallVector.h:435

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:423

llvm::SmallVectorTemplateCommon::size
size_t size() const
Definition SmallVector.h:83

llvm::SmallVectorTemplateCommon::back
reference back()
Definition SmallVector.h:317

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:86

llvm::StoreInst::getAlign
Align getAlign() const
Definition Instructions.h:339

llvm::StoreInst::isVolatile
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition Instructions.h:331

llvm::StringRef
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56

llvm::StringRef::drop_front
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:629

llvm::StringRef::size
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144

llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition TargetTransformInfo.h:2121

llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition TargetTransformInfo.h:332

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition TargetTransformInfo.h:1348

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition TargetTransformInfo.h:1257

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370

llvm::Type::getPrimitiveSizeInBits
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201

llvm::Type::getScalarSizeInBits
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236

llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141

llvm::UnaryOperator::getOpcode
UnaryOps getOpcode() const
Definition InstrTypes.h:163

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition User.h:207

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255

llvm::Value::user_begin
user_iterator user_begin()
Definition Value.h:402

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549

llvm::Value::users
iterator_range< user_iterator > users()
Definition Value.h:426

llvm::Value::use_empty
bool use_empty() const
Definition Value.h:346

llvm::Value::uses
iterator_range< use_iterator > uses()
Definition Value.h:380

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318

llvm::VectorType::getElementType
Type * getElementType() const
Definition DerivedTypes.h:515

llvm::cl::opt
Definition CommandLine.h:1454

llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200

llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition STLFunctionalExtras.h:37

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition ilist_node.h:34

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition ilist_node.h:123

llvm::iterator_range
A range adaptor for a pair of iterators.
Definition iterator_range.h:32

llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53

Changed
Changed
Definition ObjCARCOpts.cpp:2366

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

llvm::AArch64PACKey::IB
@ IB
Definition AArch64BaseInfo.h:1013

llvm::AMDGPU::HSAMD::Kernel::Arg::Key::Align
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Definition AMDGPUMetadata.h:183

llvm::ARM_MB::ST
@ ST
Definition ARMBaseInfo.h:73

llvm::ARM::ProfileKind::M
@ M
Definition ARMTargetParser.h:171

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:126

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34

llvm::GraphProgram::Name
Name
Definition GraphWriter.h:51

llvm::ISD::BasicBlock
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81

llvm::Intrinsic::getBaseName
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
Definition Intrinsics.cpp:52

llvm::M68k::MemAddrModeKind::U
@ U
Definition M68kBaseInfo.h:61

llvm::M68k::MemAddrModeKind::V
@ V
Definition M68kBaseInfo.h:63

llvm::M68k::MemAddrModeKind::K
@ K
Definition M68kBaseInfo.h:68

llvm::M68k::MemAddrModeKind::L
@ L
Definition M68kBaseInfo.h:70

llvm::MIPatternMatch::m_OneUse
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
Definition MIPatternMatch.h:56

llvm::NVPTX::Shared
@ Shared
Definition NVPTX.h:205

llvm::PatternMatchHelpers::m_CombineOr
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
Definition PatternMatchHelpers.h:56

llvm::PatternMatch
Definition PatternMatch.h:51

llvm::PatternMatch::m_Store
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
Definition PatternMatch.h:2027

llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition PatternMatch.h:1150

llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1222

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition PatternMatch.h:53

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition PatternMatch.h:939

llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition PatternMatch.h:560

llvm::PatternMatch::m_Intrinsic
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
Definition PatternMatch.h:2844

llvm::PatternMatch::m_BinOp
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition PatternMatch.h:141

llvm::PatternMatch::m_Value
auto m_Value()
Match an arbitrary value and ignore it.
Definition PatternMatch.h:135

llvm::PatternMatch::m_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
Definition PatternMatch.h:1156

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition PatternMatch.h:1216

llvm::PatternMatch::m_Load
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
Definition PatternMatch.h:2020

llvm::PatternMatch::m_ConstantInt
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition PatternMatch.h:179

llvm::RISCVFenceField::R
@ R
Definition RISCVBaseInfo.h:489

llvm::SIEncodingFamily::SI
@ SI
Definition SIDefines.h:36

llvm::SI
Definition SIInstrInfo.h:1918

llvm::SPII::Store
@ Store
Definition SparcInstrInfo.h:33

llvm::SPII::Load
@ Load
Definition SparcInstrInfo.h:32

llvm::X86AS::SS
@ SS
Definition X86.h:515

llvm::X86II::TA
@ TA
Definition X86BaseInfo.h:738

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::values
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition CommandLine.h:712

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm::codeview::FrameCookieKind::Copy
@ Copy
Definition CodeView.h:494

llvm::codeview::PublicSymFlags::Function
@ Function
Definition CodeView.h:408

llvm::dwarf_linker::DebugSectionKind::DebugLoc
@ DebugLoc
Definition DWARFLinkerBase.h:34

llvm::dxil::ElementType
ElementType
The element type of an SRV or UAV resource.
Definition DXILABI.h:68

llvm::memprof::Meta::Start
@ Start
Definition MemProf.h:69

llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
Definition MicrosoftDemangleNodes.h:121

llvm::ms_demangle::QualifierMangleMode::Result
@ Result
Definition MicrosoftDemangle.h:132

llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition OptimizationRemarkEmitter.h:139

llvm::rdf::Phi
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390

llvm::sandboxir::Instruction
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition FunctionInfo.h:25

llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315

llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition SparseBitVector.h:874

llvm::Offset
@ Offset
Definition DWP.cpp:558

llvm::Value
FunctionAddr VTableAddr Value
Definition InstrProf.h:137

llvm::fill
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758

llvm::ProfcheckDisableMetadataFixes
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60

llvm::PseudoProbeType::Block
@ Block
Definition PseudoProbe.h:30

llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668

llvm::zip_equal
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553

llvm::setExplicitlyUnknownBranchWeightsIfProfiled
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
Definition ProfDataUtils.cpp:279

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::successors
auto successors(const MachineBasicBlock *BB)
Definition MachineBasicBlock.h:1448

llvm::scope_exit
scope_exit(Callable) -> scope_exit< Callable >

llvm::operator!=
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142

llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition iterator_range.h:70

llvm::operator+=
LLVM_ATTRIBUTE_ALWAYS_INLINE DynamicAPInt & operator+=(DynamicAPInt &A, int64_t B)
Definition DynamicAPInt.h:531

llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633

llvm::concatenateVectors
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Definition VectorUtils.cpp:1231

llvm::operator==
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
Definition AddressRanges.h:151

llvm::getPointerOperand
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Definition Instructions.h:5307

llvm::addStringMetadataToLoop
LLVM_ABI void addStringMetadataToLoop(Loop *TheLoop, const char *MDString, unsigned V=0)
Set input string into loop metadata by keeping other values intact.
Definition LoopUtils.cpp:218

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407

llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635

llvm::ComplexDeinterleavingOperation::Splat
@ Splat
Definition ComplexDeinterleavingPass.h:42

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209

llvm::report_fatal_error
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163

llvm::Count
FunctionAddr VTableAddr Count
Definition InstrProf.h:139

llvm::SmallVector
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Definition SmallVector.h:1151

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::errs
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition raw_ostream.cpp:904

llvm::TTI
TargetTransformInfo TTI
Definition TargetTransformInfo.h:263

llvm::IRBuilder
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >

llvm::RecurKind::Mul
@ Mul
Product of integers.
Definition IVDescriptors.h:40

llvm::RecurKind::Add
@ Add
Sum of integers.
Definition IVDescriptors.h:37

llvm::SplitBlock
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
Definition BasicBlockUtils.cpp:1049

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:25

llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition APFixedPoint.h:312

llvm::ArrayRef
ArrayRef(const T &OneElt) -> ArrayRef< T >

llvm::copy
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:586

llvm::VFParamKind::Vector
@ Vector
Definition VFABIDemangler.h:27

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition ValueTracking.cpp:6940

llvm::AliasAnalysis
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Definition AliasAnalysis.h:746

llvm::write
LLVM_ABI Error write(DWPWriter &Out, ArrayRef< std::string > Inputs, OnCuIndexOverflow OverflowOptValue, Dwarf64StrOffsetsPromotion StrOffsetsOptValue, raw_pwrite_stream *OS=nullptr)
Definition DWP.cpp:721

llvm::createSequentialMask
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
Definition VectorUtils.cpp:1176

std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876

N
#define N

llvm::PassInfoMixin
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:89

llvm::cl::desc
Definition CommandLine.h:410