diff options
Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64InstrInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 622 |
1 files changed, 476 insertions, 146 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 7d540efe2b41..ee397db3fba6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -17,6 +17,7 @@ #include "AArch64PointerAuth.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -1354,48 +1355,52 @@ static bool areCFlagsAccessedBetweenInstrs( return false; } -/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating -/// operation which could set the flags in an identical manner -bool AArch64InstrInfo::optimizePTestInstr( - MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, - const MachineRegisterInfo *MRI) const { - auto *Mask = MRI->getUniqueVRegDef(MaskReg); - auto *Pred = MRI->getUniqueVRegDef(PredReg); - auto NewOp = Pred->getOpcode(); - bool OpChanged = false; - +std::optional<unsigned> +AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, + MachineInstr *Pred, + const MachineRegisterInfo *MRI) const { unsigned MaskOpcode = Mask->getOpcode(); unsigned PredOpcode = Pred->getOpcode(); bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); bool PredIsWhileLike = isWhileOpcode(PredOpcode); - if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) && - getElementSizeForOpcode(MaskOpcode) == - getElementSizeForOpcode(PredOpcode) && - Mask->getOperand(1).getImm() == 31) { + if (PredIsWhileLike) { + // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc + // instruction and the condition is "any" since WHILcc does an implicit + // PTEST(ALL, PG) check and PG is always a subset of ALL. + if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; + // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is // redundant since WHILE performs an implicit PTEST with an all active - // mask. Must be an all active predicate of matching element size. + // mask. + if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && + getElementSizeForOpcode(MaskOpcode) == + getElementSizeForOpcode(PredOpcode)) + return PredOpcode; + + return {}; + } + + if (PredIsPTestLike) { + // For PTEST(PG, PG), PTEST is redundant when PG is the result of an + // instruction that sets the flags as PTEST would and the condition is + // "any" since PG is always a subset of the governing predicate of the + // ptest-like instruction. + if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the - // PTEST_LIKE instruction uses the same all active mask and the element - // size matches. If the PTEST has a condition of any then it is always - // redundant. - if (PredIsPTestLike) { + // the element size matches and either the PTEST_LIKE instruction uses + // the same all active mask or the condition is "any". + if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && + getElementSizeForOpcode(MaskOpcode) == + getElementSizeForOpcode(PredOpcode)) { auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY) - return false; + if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY) + return PredOpcode; } - // Fallthough to simply remove the PTEST. - } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) && - PTest->getOpcode() == AArch64::PTEST_PP_ANY) { - // For PTEST(PG, PG), PTEST is redundant when PG is the result of an - // instruction that sets the flags as PTEST would. This is only valid when - // the condition is any. - - // Fallthough to simply remove the PTEST. - } else if (PredIsPTestLike) { // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the // flags are set based on the same mask 'PG', but PTEST_LIKE must operate // on 8-bit predicates like the PTEST. Otherwise, for instructions like @@ -1420,56 +1425,67 @@ bool AArch64InstrInfo::optimizePTestInstr( // identical regardless of element size. auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); - if ((Mask != PTestLikeMask) || - (PredElementSize != AArch64::ElementSizeB && - PTest->getOpcode() != AArch64::PTEST_PP_ANY)) - return false; + if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB || + PTest->getOpcode() == AArch64::PTEST_PP_ANY)) + return PredOpcode; - // Fallthough to simply remove the PTEST. - } else { - // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the - // opcode so the PTEST becomes redundant. - switch (PredOpcode) { - case AArch64::AND_PPzPP: - case AArch64::BIC_PPzPP: - case AArch64::EOR_PPzPP: - case AArch64::NAND_PPzPP: - case AArch64::NOR_PPzPP: - case AArch64::ORN_PPzPP: - case AArch64::ORR_PPzPP: - case AArch64::BRKA_PPzP: - case AArch64::BRKPA_PPzPP: - case AArch64::BRKB_PPzP: - case AArch64::BRKPB_PPzPP: - case AArch64::RDFFR_PPz: { - // Check to see if our mask is the same. If not the resulting flag bits - // may be different and we can't remove the ptest. - auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PredMask) - return false; - break; - } - case AArch64::BRKN_PPzP: { - // BRKN uses an all active implicit mask to set flags unlike the other - // flag-setting instructions. - // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). - if ((MaskOpcode != AArch64::PTRUE_B) || - (Mask->getOperand(1).getImm() != 31)) - return false; - break; - } - case AArch64::PTRUE_B: - // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) - break; - default: - // Bail out if we don't recognize the input - return false; - } + return {}; + } - NewOp = convertToFlagSettingOpc(PredOpcode); - OpChanged = true; + // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the + // opcode so the PTEST becomes redundant. + switch (PredOpcode) { + case AArch64::AND_PPzPP: + case AArch64::BIC_PPzPP: + case AArch64::EOR_PPzPP: + case AArch64::NAND_PPzPP: + case AArch64::NOR_PPzPP: + case AArch64::ORN_PPzPP: + case AArch64::ORR_PPzPP: + case AArch64::BRKA_PPzP: + case AArch64::BRKPA_PPzPP: + case AArch64::BRKB_PPzP: + case AArch64::BRKPB_PPzPP: + case AArch64::RDFFR_PPz: { + // Check to see if our mask is the same. If not the resulting flag bits + // may be different and we can't remove the ptest. + auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); + if (Mask != PredMask) + return {}; + break; + } + case AArch64::BRKN_PPzP: { + // BRKN uses an all active implicit mask to set flags unlike the other + // flag-setting instructions. + // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). + if ((MaskOpcode != AArch64::PTRUE_B) || + (Mask->getOperand(1).getImm() != 31)) + return {}; + break; + } + case AArch64::PTRUE_B: + // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) + break; + default: + // Bail out if we don't recognize the input + return {}; } + return convertToFlagSettingOpc(PredOpcode); +} + +/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating +/// operation which could set the flags in an identical manner +bool AArch64InstrInfo::optimizePTestInstr( + MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, + const MachineRegisterInfo *MRI) const { + auto *Mask = MRI->getUniqueVRegDef(MaskReg); + auto *Pred = MRI->getUniqueVRegDef(PredReg); + unsigned PredOpcode = Pred->getOpcode(); + auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI); + if (!NewOp) + return false; + const TargetRegisterInfo *TRI = &getRegisterInfo(); // If another instruction between Pred and PTest accesses flags, don't remove @@ -1481,9 +1497,9 @@ bool AArch64InstrInfo::optimizePTestInstr( // as they are prior to PTEST. Sometimes this requires the tested PTEST // operand to be replaced with an equivalent instruction that also sets the // flags. - Pred->setDesc(get(NewOp)); PTest->eraseFromParent(); - if (OpChanged) { + if (*NewOp != PredOpcode) { + Pred->setDesc(get(*NewOp)); bool succeeded = UpdateOperandRegClass(*Pred); (void)succeeded; assert(succeeded && "Operands have incompatible register classes!"); @@ -4481,7 +4497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Predicate register by ORRing with itself. if (AArch64::PPRRegClass.contains(DestReg) && AArch64::PPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) .addReg(SrcReg) // Pg .addReg(SrcReg) @@ -4494,8 +4511,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); if (DestIsPNR || SrcIsPNR) { - assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected predicate-as-counter register."); auto ToPPR = [](MCRegister R) -> MCRegister { return (R - AArch64::PN0) + AArch64::P0; }; @@ -4516,7 +4531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register by ORRing with itself. if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -4528,7 +4544,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && (AArch64::ZPR2RegClass.contains(SrcReg) || AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, Indices); @@ -4538,7 +4555,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register triple by copying the individual sub-registers. if (AArch64::ZPR3RegClass.contains(DestReg) && AArch64::ZPR3RegClass.contains(SrcReg)) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -4551,7 +4569,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && (AArch64::ZPR4RegClass.contains(SrcReg) || AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { - assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2, AArch64::zsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -4656,7 +4675,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { - if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) + if (Subtarget.isSVEorStreamingSVEAvailable() && + !Subtarget.isNeonAvailable()) BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) @@ -4814,14 +4834,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::STRBui; break; case 2: { - bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC); if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::STRHui; - else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + else if (AArch64::PNRRegClass.hasSubClassEq(RC) || + AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); - assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected register store without SVE2p1 or SME2"); Opc = AArch64::STR_PXI; StackID = TargetStackID::ScalableVector; } @@ -4870,7 +4888,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, AArch64::sube64, AArch64::subo64, FI, MMO); return; } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZXI; StackID = TargetStackID::ScalableVector; @@ -4894,7 +4912,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -4906,7 +4924,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Opc = AArch64::ST1Threev2d; Offset = false; } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZXI; StackID = TargetStackID::ScalableVector; @@ -4919,7 +4937,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -4992,10 +5010,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRHui; else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); - assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && - "Unexpected register load without SVE2p1 or SME2"); if (IsPNR) PNRReg = DestReg; Opc = AArch64::LDR_PXI; @@ -5046,7 +5062,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, AArch64::subo64, FI, MMO); return; } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZXI; StackID = TargetStackID::ScalableVector; @@ -5070,7 +5086,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZXI; StackID = TargetStackID::ScalableVector; @@ -5082,7 +5098,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Opc = AArch64::LD1Threev2d; Offset = false; } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZXI; StackID = TargetStackID::ScalableVector; @@ -5095,7 +5111,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Offset = false; } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasSVEorSME() && + assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZZXI; StackID = TargetStackID::ScalableVector; @@ -8555,6 +8571,8 @@ AArch64InstrInfo::getOutliningCandidateInfo( NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { RepeatedSequenceLocs = CandidatesWithoutStackFixups; FrameID = MachineOutlinerNoLRSave; + if (RepeatedSequenceLocs.size() < 2) + return std::nullopt; } else { SetCandidateCallInfo(MachineOutlinerDefault, 12); @@ -8700,6 +8718,13 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( if (!AFI || AFI->hasRedZone().value_or(true)) return false; + // FIXME: Determine whether it is safe to outline from functions which contain + // streaming-mode changes. We may need to ensure any smstart/smstop pairs are + // outlined together and ensure it is safe to outline with async unwind info, + // required for saving & restoring VG around calls. + if (AFI->hasStreamingModeChanges()) + return false; + // FIXME: Teach the outliner to generate/handle Windows unwind info. if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) return false; @@ -9582,18 +9607,49 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, namespace { class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { - MachineInstr *PredBranch; + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo &MRI; + + /// The block of the loop + MachineBasicBlock *LoopBB; + /// The conditional branch of the loop + MachineInstr *CondBranch; + /// The compare instruction for loop control + MachineInstr *Comp; + /// The number of the operand of the loop counter value in Comp + unsigned CompCounterOprNum; + /// The instruction that updates the loop counter value + MachineInstr *Update; + /// The number of the operand of the loop counter value in Update + unsigned UpdateCounterOprNum; + /// The initial value of the loop counter + Register Init; + /// True iff Update is a predecessor of Comp + bool IsUpdatePriorComp; + + /// The normalized condition used by createTripCountGreaterCondition() SmallVector<MachineOperand, 4> Cond; public: - AArch64PipelinerLoopInfo(MachineInstr *PredBranch, + AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch, + MachineInstr *Comp, unsigned CompCounterOprNum, + MachineInstr *Update, unsigned UpdateCounterOprNum, + Register Init, bool IsUpdatePriorComp, const SmallVectorImpl<MachineOperand> &Cond) - : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {} + : MF(Comp->getParent()->getParent()), + TII(MF->getSubtarget().getInstrInfo()), + TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()), + LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp), + CompCounterOprNum(CompCounterOprNum), Update(Update), + UpdateCounterOprNum(UpdateCounterOprNum), Init(Init), + IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {} bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { // Make the instructions for loop control be placed in stage 0. - // The predecessors of PredBranch are considered by the caller. - return MI == PredBranch; + // The predecessors of Comp are considered by the caller. + return MI == Comp; } std::optional<bool> createTripCountGreaterCondition( @@ -9606,31 +9662,277 @@ public: return {}; } + void createRemainingIterationsGreaterCondition( + int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, + DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override; + void setPreheader(MachineBasicBlock *NewPreheader) override {} void adjustTripCount(int TripCountAdjust) override {} void disposed() override {} + bool isMVEExpanderSupported() override { return true; } }; } // namespace -static bool isCompareAndBranch(unsigned Opcode) { - switch (Opcode) { - case AArch64::CBZW: - case AArch64::CBZX: - case AArch64::CBNZW: - case AArch64::CBNZX: - case AArch64::TBZW: - case AArch64::TBZX: - case AArch64::TBNZW: - case AArch64::TBNZX: - return true; +/// Clone an instruction from MI. The register of ReplaceOprNum-th operand +/// is replaced by ReplaceReg. The output register is newly created. +/// The other operands are unchanged from MI. +static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, + Register ReplaceReg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertTo) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); + Register Result = 0; + for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { + if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) { + Result = MRI.createVirtualRegister( + MRI.getRegClass(NewMI->getOperand(0).getReg())); + NewMI->getOperand(I).setReg(Result); + } else if (I == ReplaceOprNum) { + MRI.constrainRegClass( + ReplaceReg, + TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent())); + NewMI->getOperand(I).setReg(ReplaceReg); + } } - return false; + MBB.insert(InsertTo, NewMI); + return Result; +} + +void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition( + int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond, + DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) { + // Create and accumulate conditions for next TC iterations. + // Example: + // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last + // # iteration of the kernel + // + // # insert the following instructions + // cond = CSINCXr 0, 0, C, implicit $nzcv + // counter = ADDXri counter, 1 # clone from this->Update + // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp + // cond = CSINCXr cond, cond, C, implicit $nzcv + // ... (repeat TC times) + // SUBSXri cond, 0, implicit-def $nzcv + + assert(CondBranch->getOpcode() == AArch64::Bcc); + // CondCode to exit the loop + AArch64CC::CondCode CC = + (AArch64CC::CondCode)CondBranch->getOperand(0).getImm(); + if (CondBranch->getOperand(1).getMBB() == LoopBB) + CC = AArch64CC::getInvertedCondCode(CC); + + // Accumulate conditions to exit the loop + Register AccCond = AArch64::XZR; + + // If CC holds, CurCond+1 is returned; otherwise CurCond is returned. + auto AccumulateCond = [&](Register CurCond, + AArch64CC::CondCode CC) -> Register { + Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr)) + .addReg(NewCond, RegState::Define) + .addReg(CurCond) + .addReg(CurCond) + .addImm(AArch64CC::getInvertedCondCode(CC)); + return NewCond; + }; + + if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) { + // Update and Comp for I==0 are already exists in MBB + // (MBB is an unrolled kernel) + Register Counter; + for (int I = 0; I <= TC; ++I) { + Register NextCounter; + if (I != 0) + NextCounter = + cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end()); + + AccCond = AccumulateCond(AccCond, CC); + + if (I != TC) { + if (I == 0) { + if (Update != Comp && IsUpdatePriorComp) { + Counter = + LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg(); + NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, + MBB.end()); + } else { + // can use already calculated value + NextCounter = LastStage0Insts[Update]->getOperand(0).getReg(); + } + } else if (Update != Comp) { + NextCounter = + cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + } + } + Counter = NextCounter; + } + } else { + Register Counter; + if (LastStage0Insts.empty()) { + // use initial counter value (testing if the trip count is sufficient to + // be executed by pipelined code) + Counter = Init; + if (IsUpdatePriorComp) + Counter = + cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + } else { + // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block. + Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg(); + } + + for (int I = 0; I <= TC; ++I) { + Register NextCounter; + NextCounter = + cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end()); + AccCond = AccumulateCond(AccCond, CC); + if (I != TC && Update != Comp) + NextCounter = + cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end()); + Counter = NextCounter; + } + } + + // If AccCond == 0, the remainder is greater than TC. + BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri)) + .addReg(AArch64::XZR, RegState::Define | RegState::Dead) + .addReg(AccCond) + .addImm(0) + .addImm(0); + Cond.clear(); + Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ)); +} + +static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, + Register &RegMBB, Register &RegOther) { + assert(Phi.getNumOperands() == 5); + if (Phi.getOperand(2).getMBB() == MBB) { + RegMBB = Phi.getOperand(1).getReg(); + RegOther = Phi.getOperand(3).getReg(); + } else { + assert(Phi.getOperand(4).getMBB() == MBB); + RegMBB = Phi.getOperand(3).getReg(); + RegOther = Phi.getOperand(1).getReg(); + } +} + +static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) { + if (!Reg.isVirtual()) + return false; + const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + return MRI.getVRegDef(Reg)->getParent() != BB; +} + +/// If Reg is an induction variable, return true and set some parameters +static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, + MachineInstr *&UpdateInst, + unsigned &UpdateCounterOprNum, Register &InitReg, + bool &IsUpdatePriorComp) { + // Example: + // + // Preheader: + // InitReg = ... + // LoopBB: + // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB) + // Reg = COPY Reg0 ; COPY is ignored. + // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value. + // ; Reg is the value calculated in the previous + // ; iteration, so IsUpdatePriorComp == false. + + if (LoopBB->pred_size() != 2) + return false; + if (!Reg.isVirtual()) + return false; + const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); + UpdateInst = nullptr; + UpdateCounterOprNum = 0; + InitReg = 0; + IsUpdatePriorComp = true; + Register CurReg = Reg; + while (true) { + MachineInstr *Def = MRI.getVRegDef(CurReg); + if (Def->getParent() != LoopBB) + return false; + if (Def->isCopy()) { + // Ignore copy instructions unless they contain subregisters + if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg()) + return false; + CurReg = Def->getOperand(1).getReg(); + } else if (Def->isPHI()) { + if (InitReg != 0) + return false; + if (!UpdateInst) + IsUpdatePriorComp = false; + extractPhiReg(*Def, LoopBB, CurReg, InitReg); + } else { + if (UpdateInst) + return false; + switch (Def->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::SUBSWri: + case AArch64::ADDXri: + case AArch64::ADDWri: + case AArch64::SUBXri: + case AArch64::SUBWri: + UpdateInst = Def; + UpdateCounterOprNum = 1; + break; + case AArch64::ADDSXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + case AArch64::ADDXrr: + case AArch64::ADDWrr: + case AArch64::SUBXrr: + case AArch64::SUBWrr: + UpdateInst = Def; + if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB)) + UpdateCounterOprNum = 1; + else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB)) + UpdateCounterOprNum = 2; + else + return false; + break; + default: + return false; + } + CurReg = Def->getOperand(UpdateCounterOprNum).getReg(); + } + + if (!CurReg.isVirtual()) + return false; + if (Reg == CurReg) + break; + } + + if (!UpdateInst) + return false; + + return true; } std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + // Accept loops that meet the following conditions + // * The conditional branch is BCC + // * The compare instruction is ADDS/SUBS/WHILEXX + // * One operand of the compare is an induction variable and the other is a + // loop invariant value + // * The induction variable is incremented/decremented by a single instruction + // * Does not contain CALL or instructions which have unmodeled side effects + + for (MachineInstr &MI : *LoopBB) + if (MI.isCall() || MI.hasUnmodeledSideEffects()) + // This instruction may use NZCV, which interferes with the instruction to + // be inserted for loop control. + return nullptr; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; if (analyzeBranch(*LoopBB, TBB, FBB, Cond)) @@ -9641,48 +9943,76 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { return nullptr; // Must be conditional branch - if (FBB == nullptr) + if (TBB != LoopBB && FBB == nullptr) return nullptr; assert((TBB == LoopBB || FBB == LoopBB) && "The Loop must be a single-basic-block loop"); + MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); + const TargetRegisterInfo &TRI = getRegisterInfo(); + + if (CondBranch->getOpcode() != AArch64::Bcc) + return nullptr; + // Normalization for createTripCountGreaterCondition() if (TBB == LoopBB) reverseBranchCondition(Cond); - MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); - const TargetRegisterInfo &TRI = getRegisterInfo(); - - // Find the immediate predecessor of the conditional branch - MachineInstr *PredBranch = nullptr; - if (CondBranch->getOpcode() == AArch64::Bcc) { - for (MachineInstr &MI : reverse(*LoopBB)) { - if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { - PredBranch = &MI; + MachineInstr *Comp = nullptr; + unsigned CompCounterOprNum = 0; + for (MachineInstr &MI : reverse(*LoopBB)) { + if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { + // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the + // operands is a loop invariant value + + switch (MI.getOpcode()) { + case AArch64::SUBSXri: + case AArch64::SUBSWri: + case AArch64::ADDSXri: + case AArch64::ADDSWri: + Comp = &MI; + CompCounterOprNum = 1; break; + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + Comp = &MI; + break; + default: + if (isWhileOpcode(MI.getOpcode())) { + Comp = &MI; + break; + } + return nullptr; } - } - if (!PredBranch) - return nullptr; - } else if (isCompareAndBranch(CondBranch->getOpcode())) { - const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); - Register Reg = CondBranch->getOperand(0).getReg(); - if (!Reg.isVirtual()) - return nullptr; - PredBranch = MRI.getVRegDef(Reg); - // MachinePipeliner does not expect that the immediate predecessor is a Phi - if (PredBranch->isPHI()) - return nullptr; + if (CompCounterOprNum == 0) { + if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB)) + CompCounterOprNum = 2; + else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB)) + CompCounterOprNum = 1; + else + return nullptr; + } + break; + } + } + if (!Comp) + return nullptr; - if (PredBranch->getParent() != LoopBB) - return nullptr; - } else { + MachineInstr *Update = nullptr; + Register Init; + bool IsUpdatePriorComp; + unsigned UpdateCounterOprNum; + if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB, + Update, UpdateCounterOprNum, Init, IsUpdatePriorComp)) return nullptr; - } - return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond); + return std::make_unique<AArch64PipelinerLoopInfo>( + LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum, + Init, IsUpdatePriorComp, Cond); } #define GET_INSTRINFO_HELPERS |
