diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 176 |
1 files changed, 175 insertions, 1 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 03b92d3338a9..0874b29ab7d2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -39,6 +39,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/HashRecognize.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" @@ -143,6 +144,14 @@ static cl::opt<bool, true> cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden); +bool DisableLIRP::HashRecognize; +static cl::opt<bool, true> + DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", + cl::desc("Proceed with loop idiom recognize pass, " + "but do not optimize CRC loops."), + cl::location(DisableLIRP::HashRecognize), + cl::init(false), cl::ReallyHidden); + static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " @@ -242,6 +251,7 @@ private: const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); + bool optimizeCRCLoop(const PolynomialInfo &Info); /// @} /// \name Noncountable Loop Idiom Handling @@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); + std::optional<PolynomialInfo> HR; + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) + if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || + HasMemcpy || !DisableLIRP::HashRecognize) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } + + // Optimize a CRC loop if HashRecognize found one, provided we're not + // optimizing for size. + if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics) + if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) + optimizeCRCLoop(*Res); + return MadeChange; } @@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, return false; } +bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { + // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using + // carry-less multiplication instructions, which is more efficient than our + // Sarwate table-lookup optimization. Hence, until we're able to emit + // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, + // disable the optimization for Hexagon. + Module &M = *CurLoop->getHeader()->getModule(); + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::hexagon) + return false; + + // First, create a new GlobalVariable corresponding to the + // Sarwate-lookup-table. + Type *CRCTy = Info.LHS->getType(); + unsigned CRCBW = CRCTy->getIntegerBitWidth(); + std::array<Constant *, 256> CRCConstants; + transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), + CRCConstants.begin(), + [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); + Constant *ConstArray = + ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); + GlobalVariable *GV = + new GlobalVariable(M, ConstArray->getType(), true, + GlobalValue::PrivateLinkage, ConstArray, ".crctable"); + + PHINode *IV = CurLoop->getCanonicalInductionVariable(); + SmallVector<PHINode *, 2> Cleanup; + + // Next, mark all PHIs for removal except IV. + { + for (PHINode &PN : CurLoop->getHeader()->phis()) { + if (&PN == IV) + continue; + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + Cleanup.push_back(&PN); + } + } + + // Next, fix up the trip count. + { + unsigned NewBTC = (Info.TripCount / 8) - 1; + BasicBlock *LoopBlk = CurLoop->getLoopLatch(); + BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); + CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk + ? ICmpInst::Predicate::ICMP_NE + : ICmpInst::Predicate::ICMP_EQ; + Instruction *ExitCond = CurLoop->getLatchCmpInst(); + Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); + IRBuilder<> Builder(ExitCond); + Value *NewExitCond = + Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); + ExitCond->replaceAllUsesWith(NewExitCond); + deleteDeadInstruction(ExitCond); + } + + // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all + // uses of ComputedValue. + // + // Little-endian: + // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] + // Big-Endian: + // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] + { + auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { + Type *OpTy = Op->getType(); + unsigned OpBW = OpTy->getIntegerBitWidth(); + return OpBW > 8 + ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) + : Op; + }; + auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, + const Twine &Name) { + Type *OpTy = Op->getType(); + + // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to + // use the CRC's bitwidth as the reference for shifting right. + return LoByte(Builder, + CRCBW > 8 ? Builder.CreateLShr( + Op, ConstantInt::get(OpTy, CRCBW - 8), Name) + : Op, + Name + ".lo.byte"); + }; + + IRBuilder<> Builder(CurLoop->getHeader(), + CurLoop->getHeader()->getFirstNonPHIIt()); + + // Create the CRC PHI, and initialize its incoming value to the initial + // value of CRC. + PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); + CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); + + // CRC is now an evolving variable, initialized to the PHI. + Value *CRC = CRCPhi; + + // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte + // of LHSAux), if LHSAux is non-nullptr. + Value *Indexer = CRC; + if (Value *Data = Info.LHSAux) { + Type *DataTy = Data->getType(); + + // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we + // shift right by that amount, and take the lo-byte (in the little-endian + // case), or shift left by that amount, and take the hi-idx (in the + // big-endian case). + Value *IVBits = Builder.CreateZExtOrTrunc( + Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); + Value *DataIndexer = + Info.ByteOrderSwapped + ? Builder.CreateShl(Data, IVBits, "data.indexer") + : Builder.CreateLShr(Data, IVBits, "data.indexer"); + Indexer = Builder.CreateXor( + DataIndexer, + Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), + "crc.data.indexer"); + } + + Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") + : LoByte(Builder, Indexer, "indexer.lo"); + + // Always index into a GEP using the index type. + Indexer = Builder.CreateZExt( + Indexer, SE->getDataLayout().getIndexType(GV->getType()), + "indexer.ext"); + + // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. + Value *CRCTableGEP = + Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); + Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); + + // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of + // CRC-8. + Value *CRCNext = CRCTableLd; + if (CRCBW > 8) { + Value *CRCShift = Info.ByteOrderSwapped + ? Builder.CreateShl(CRC, 8, "crc.be.shift") + : Builder.CreateLShr(CRC, 8, "crc.le.shift"); + CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); + } + + // Connect the back-edge for the loop, and RAUW the ComputedValue. + CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); + Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, + CurLoop->getLoopLatch()); + } + + // Cleanup. + { + for (PHINode *PN : Cleanup) + RecursivelyDeleteDeadPHINode(PN); + SE->forgetLoop(CurLoop); + } + return true; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << CurLoop->getHeader()->getParent()->getName() |
