20 files changed, 1979 insertions, 140 deletions
diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
index c6b9cc2eb4b9..721fd664a325 100644
--- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h
+++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
@@ -199,8 +199,7 @@ namespace PAuthGadgetScanner {
 // to distinguish intermediate and final results at the type level.
 //
 // Here is an overview of issue life-cycle:
-// * an analysis (SrcSafetyAnalysis at now, DstSafetyAnalysis will be added
-//   later to support the detection of authentication oracles) computes register
+// * an analysis (SrcSafetyAnalysis or DstSafetyAnalysis) computes register
 //   state for each instruction in the function.
 // * for each instruction, it is checked whether it is a gadget of some kind,
 //   taking the computed state into account. If a gadget is found, its kind
@@ -273,6 +272,11 @@ public:
   virtual ~ExtraInfo() {}
 };
 
+/// The set of instructions writing to the affected register in an unsafe
+/// manner.
+///
+/// This is a hint to be printed alongside the report. It should be further
+/// analyzed by the user.
 class ClobberingInfo : public ExtraInfo {
   SmallVector<MCInstReference> ClobberingInstrs;
 
@@ -282,6 +286,20 @@ public:
   void print(raw_ostream &OS, const MCInstReference Location) const override;
 };
 
+/// The set of instructions leaking the authenticated pointer before the
+/// result of authentication was checked.
+///
+/// This is a hint to be printed alongside the report. It should be further
+/// analyzed by the user.
+class LeakageInfo : public ExtraInfo {
+  SmallVector<MCInstReference> LeakingInstrs;
+
+public:
+  LeakageInfo(ArrayRef<MCInstReference> Instrs) : LeakingInstrs(Instrs) {}
+
+  void print(raw_ostream &OS, const MCInstReference Location) const override;
+};
+
 /// A brief version of a report that can be further augmented with the details.
 ///
 /// A half-baked report produced on the first run of the analysis. An extra,
@@ -322,6 +340,9 @@ class FunctionAnalysisContext {
   void findUnsafeUses(SmallVector<PartialReport<MCPhysReg>> &Reports);
   void augmentUnsafeUseReports(ArrayRef<PartialReport<MCPhysReg>> Reports);
 
+  void findUnsafeDefs(SmallVector<PartialReport<MCPhysReg>> &Reports);
+  void augmentUnsafeDefReports(ArrayRef<PartialReport<MCPhysReg>> Reports);
+
   /// Process the reports which do not have to be augmented, and remove them
   /// from Reports.
   void handleSimpleReports(SmallVector<PartialReport<MCPhysReg>> &Reports);
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 96969cf53bac..cc28a06c151e 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -85,6 +85,8 @@ private:
   };
   friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
 
+  friend struct PerfSpeEventsTestHelper;
+
   struct PerfBranchSample {
     SmallVector<LBREntry, 32> LBR;
   };
@@ -99,16 +101,17 @@ private:
     uint64_t Addr;
   };
 
-  /// Container for the unit of branch data.
-  /// Backwards compatible with legacy use for branches and fall-throughs:
-  /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only
-  ///   contains fall-through data,
-  /// - if \p To is BR_ONLY, the trace only contains branch data.
+  /// Container for the unit of branch data, matching pre-aggregated trace type.
+  /// Backwards compatible with branch and fall-through types:
+  /// - if \p To is < 0, the trace only contains branch data (BR_ONLY),
+  /// - if \p Branch is < 0, the trace only contains fall-through data
+  ///   (FT_ONLY, FT_EXTERNAL_ORIGIN, or FT_EXTERNAL_RETURN).
   struct Trace {
     static constexpr const uint64_t EXTERNAL = 0ULL;
     static constexpr const uint64_t BR_ONLY = -1ULL;
     static constexpr const uint64_t FT_ONLY = -1ULL;
     static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+    static constexpr const uint64_t FT_EXTERNAL_RETURN = -3ULL;
 
     uint64_t Branch;
     uint64_t From;
@@ -388,9 +391,9 @@ private:
   /// File format syntax:
   /// E <event>
   /// S <start> <count>
-  /// T <start> <end> <ft_end> <count>
+  /// [TR] <start> <end> <ft_end> <count>
   /// B <start> <end> <count> <mispred_count>
-  /// [Ff] <start> <end> <count>
+  /// [Ffr] <start> <end> <count>
   ///
   /// where <start>, <end>, <ft_end> have the format [<id>:]<offset>
   ///
@@ -401,8 +404,11 @@ private:
   /// f - an aggregated fall-through with external origin - used to disambiguate
   ///       between a return hitting a basic block head and a regular internal
   ///       jump to the block
+  /// r - an aggregated fall-through originating at an external return, no
+  ///       checks are performed for a fallthrough start
   /// T - an aggregated trace: branch from <start> to <end> with a fall-through
   ///       to <ft_end>
+  /// R - an aggregated trace originating at a return
   ///
   /// <id> - build id of the object containing the address. We can skip it for
   /// the main binary and use "X" for an unknown object. This will save some
@@ -530,7 +536,12 @@ inline raw_ostream &operator<<(raw_ostream &OS,
                                const DataAggregator::Trace &T) {
   switch (T.Branch) {
   case DataAggregator::Trace::FT_ONLY:
+    break;
   case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
+    OS << "X:0 -> ";
+    break;
+  case DataAggregator::Trace::FT_EXTERNAL_RETURN:
+    OS << "X:R -> ";
     break;
   default:
     OS << Twine::utohexstr(T.Branch) << " -> ";
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index 4acce5a3e832..a75b6bf720ec 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -48,6 +48,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern llvm::cl::opt<unsigned> AlignFunctions;
 extern llvm::cl::opt<bool> AggregateOnly;
+extern llvm::cl::opt<bool> ArmSPE;
 extern llvm::cl::opt<unsigned> BucketsPerLine;
 extern llvm::cl::opt<bool> CompactCodeModel;
 extern llvm::cl::opt<bool> DiffOnly;
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 971ea5fdef42..95e831fe9c8c 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -152,6 +152,8 @@ public:
 //    in the gadgets to be reported. This information is used in the second run
 //    to also track which instructions last wrote to those registers.
 
+typedef SmallPtrSet<const MCInst *, 4> SetOfRelatedInsts;
+
 /// A state representing which registers are safe to use by an instruction
 /// at a given program point.
 ///
@@ -195,7 +197,7 @@ struct SrcState {
   /// pac-ret analysis, the expectation is that almost all return instructions
   /// only use register `X30`, and therefore, this vector will probably have
   /// length 1 in the second run.
-  std::vector<SmallPtrSet<const MCInst *, 4>> LastInstWritingReg;
+  std::vector<SetOfRelatedInsts> LastInstWritingReg;
 
   /// Construct an empty state.
   SrcState() {}
@@ -230,12 +232,11 @@ struct SrcState {
   bool operator!=(const SrcState &RHS) const { return !((*this) == RHS); }
 };
 
-static void
-printLastInsts(raw_ostream &OS,
-               ArrayRef<SmallPtrSet<const MCInst *, 4>> LastInstWritingReg) {
+static void printInstsShort(raw_ostream &OS,
+                            ArrayRef<SetOfRelatedInsts> Insts) {
   OS << "Insts: ";
-  for (unsigned I = 0; I < LastInstWritingReg.size(); ++I) {
-    auto &Set = LastInstWritingReg[I];
+  for (unsigned I = 0; I < Insts.size(); ++I) {
+    auto &Set = Insts[I];
     OS << "[" << I << "](";
     for (const MCInst *MCInstP : Set)
       OS << MCInstP << " ";
@@ -243,14 +244,14 @@ printLastInsts(raw_ostream &OS,
   }
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const SrcState &S) {
+static raw_ostream &operator<<(raw_ostream &OS, const SrcState &S) {
   OS << "src-state<";
   if (S.empty()) {
     OS << "empty";
   } else {
     OS << "SafeToDerefRegs: " << S.SafeToDerefRegs << ", ";
     OS << "TrustedRegs: " << S.TrustedRegs << ", ";
-    printLastInsts(OS, S.LastInstWritingReg);
+    printInstsShort(OS, S.LastInstWritingReg);
   }
   OS << ">";
   return OS;
@@ -279,7 +280,7 @@ void SrcStatePrinter::print(raw_ostream &OS, const SrcState &S) const {
     OS << ", TrustedRegs: ";
     RegStatePrinter.print(OS, S.TrustedRegs);
     OS << ", ";
-    printLastInsts(OS, S.LastInstWritingReg);
+    printInstsShort(OS, S.LastInstWritingReg);
   }
   OS << ">";
 }
@@ -323,13 +324,12 @@ protected:
   DenseMap<const MCInst *, std::pair<MCPhysReg, const MCInst *>>
       CheckerSequenceInfo;
 
-  SmallPtrSet<const MCInst *, 4> &lastWritingInsts(SrcState &S,
-                                                   MCPhysReg Reg) const {
+  SetOfRelatedInsts &lastWritingInsts(SrcState &S, MCPhysReg Reg) const {
     unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
     return S.LastInstWritingReg[Index];
   }
-  const SmallPtrSet<const MCInst *, 4> &lastWritingInsts(const SrcState &S,
-                                                         MCPhysReg Reg) const {
+  const SetOfRelatedInsts &lastWritingInsts(const SrcState &S,
+                                            MCPhysReg Reg) const {
     unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
     return S.LastInstWritingReg[Index];
   }
@@ -430,11 +430,13 @@ protected:
   }
 
   SrcState computeNext(const MCInst &Point, const SrcState &Cur) {
+    if (BC.MIB->isCFI(Point))
+      return Cur;
+
     SrcStatePrinter P(BC);
     LLVM_DEBUG({
       dbgs() << "  SrcSafetyAnalysis::ComputeNext(";
-      BC.InstPrinter->printInst(&const_cast<MCInst &>(Point), 0, "", *BC.STI,
-                                dbgs());
+      BC.InstPrinter->printInst(&Point, 0, "", *BC.STI, dbgs());
       dbgs() << ", ";
       P.print(dbgs(), Cur);
       dbgs() << ")\n";
@@ -612,6 +614,42 @@ protected:
   StringRef getAnnotationName() const { return "DataflowSrcSafetyAnalysis"; }
 };
 
+/// A helper base class for implementing a simplified counterpart of a dataflow
+/// analysis for functions without CFG information.
+template <typename StateTy> class CFGUnawareAnalysis {
+  BinaryContext &BC;
+  BinaryFunction &BF;
+  MCPlusBuilder::AllocatorIdTy AllocId;
+  unsigned StateAnnotationIndex;
+
+  void cleanStateAnnotations() {
+    for (auto &I : BF.instrs())
+      BC.MIB->removeAnnotation(I.second, StateAnnotationIndex);
+  }
+
+protected:
+  CFGUnawareAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId,
+                     StringRef AnnotationName)
+      : BC(BF.getBinaryContext()), BF(BF), AllocId(AllocId) {
+    StateAnnotationIndex = BC.MIB->getOrCreateAnnotationIndex(AnnotationName);
+  }
+
+  void setState(MCInst &Inst, const StateTy &S) {
+    // Check if we need to remove an old annotation (this is the case if
+    // this is the second, detailed run of the analysis).
+    if (BC.MIB->hasAnnotation(Inst, StateAnnotationIndex))
+      BC.MIB->removeAnnotation(Inst, StateAnnotationIndex);
+    // Attach the state.
+    BC.MIB->addAnnotation(Inst, StateAnnotationIndex, S, AllocId);
+  }
+
+  const StateTy &getState(const MCInst &Inst) const {
+    return BC.MIB->getAnnotationAs<StateTy>(Inst, StateAnnotationIndex);
+  }
+
+  virtual ~CFGUnawareAnalysis() { cleanStateAnnotations(); }
+};
+
 // A simplified implementation of DataflowSrcSafetyAnalysis for functions
 // lacking CFG information.
 //
@@ -646,15 +684,10 @@ protected:
 // of instructions without labels in between. These sequences can be processed
 // the same way basic blocks are processed by data-flow analysis, assuming
 // pessimistically that all registers are unsafe at the start of each sequence.
-class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis {
+class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis,
+                                    public CFGUnawareAnalysis<SrcState> {
+  using SrcSafetyAnalysis::BC;
   BinaryFunction &BF;
-  MCPlusBuilder::AllocatorIdTy AllocId;
-  unsigned StateAnnotationIndex;
-
-  void cleanStateAnnotations() {
-    for (auto &I : BF.instrs())
-      BC.MIB->removeAnnotation(I.second, StateAnnotationIndex);
-  }
 
   /// Creates a state with all registers marked unsafe (not to be confused
   /// with empty state).
@@ -666,15 +699,16 @@ public:
   CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF,
                               MCPlusBuilder::AllocatorIdTy AllocId,
                               ArrayRef<MCPhysReg> RegsToTrackInstsFor)
-      : SrcSafetyAnalysis(BF, RegsToTrackInstsFor), BF(BF), AllocId(AllocId) {
-    StateAnnotationIndex =
-        BC.MIB->getOrCreateAnnotationIndex("CFGUnawareSrcSafetyAnalysis");
+      : SrcSafetyAnalysis(BF, RegsToTrackInstsFor),
+        CFGUnawareAnalysis(BF, AllocId, "CFGUnawareSrcSafetyAnalysis"), BF(BF) {
   }
 
   void run() override {
     SrcState S = createEntryState();
     for (auto &I : BF.instrs()) {
       MCInst &Inst = I.second;
+      if (BC.MIB->isCFI(Inst))
+        continue;
 
       // If there is a label before this instruction, it is possible that it
       // can be jumped-to, thus conservatively resetting S. As an exception,
@@ -687,12 +721,8 @@ public:
         S = createUnsafeState();
       }
 
-      // Check if we need to remove an old annotation (this is the case if
-      // this is the second, detailed, run of the analysis).
-      if (BC.MIB->hasAnnotation(Inst, StateAnnotationIndex))
-        BC.MIB->removeAnnotation(Inst, StateAnnotationIndex);
       // Attach the state *before* this instruction executes.
-      BC.MIB->addAnnotation(Inst, StateAnnotationIndex, S, AllocId);
+      setState(Inst, S);
 
       // Compute the state after this instruction executes.
       S = computeNext(Inst, S);
@@ -700,10 +730,8 @@ public:
   }
 
   const SrcState &getStateBefore(const MCInst &Inst) const override {
-    return BC.MIB->getAnnotationAs<SrcState>(Inst, StateAnnotationIndex);
+    return getState(Inst);
   }
-
-  ~CFGUnawareSrcSafetyAnalysis() { cleanStateAnnotations(); }
 };
 
 std::shared_ptr<SrcSafetyAnalysis>
@@ -717,6 +745,483 @@ SrcSafetyAnalysis::create(BinaryFunction &BF,
                                                        RegsToTrackInstsFor);
 }
 
+/// A state representing which registers are safe to be used as the destination
+/// operand of an authentication instruction.
+///
+/// Similar to SrcState, it is the responsibility of the analysis to take
+/// register aliasing into account.
+///
+/// Depending on the implementation (such as whether FEAT_FPAC is implemented
+/// by an AArch64 CPU or not), it may be possible that an authentication
+/// instruction returns an invalid pointer on failure instead of terminating
+/// the program immediately (assuming the program will crash as soon as that
+/// pointer is dereferenced). Since few bits are usually allocated for the PAC
+/// field (such as less than 16 bits on a typical AArch64 system), an attacker
+/// can try every possible signature and guess the correct one if there is a
+/// gadget that tells whether the particular pointer has a correct signature
+/// (a so called "authentication oracle"). For that reason, it should be
+/// impossible for an attacker to test if a pointer is correctly signed -
+/// either the program should be terminated on authentication failure or
+/// the result of authentication should not be accessible to an attacker.
+///
+/// Considering the instructions in forward order as they are executed, a
+/// restricted set of operations can be allowed on any register containing a
+/// value derived from the result of an authentication instruction until that
+/// value is checked not to contain the result of a failed authentication.
+/// In DstSafetyAnalysis, these rules are adapted, so that the safety property
+/// for a register is computed by iterating the instructions in backward order.
+/// Then the resulting properties are used at authentication instruction sites
+/// to check output registers and report the particular instruction if it writes
+/// to an unsafe register.
+///
+/// Another approach would be to simulate the above rules as-is, iterating over
+/// the instructions in forward direction. To make it possible to report the
+/// particular instructions as oracles, this would probably require tracking
+/// references to these instructions for each register currently containing
+/// sensitive data.
+///
+/// In DstSafetyAnalysis, the source register Xn of an instruction Inst is safe
+/// if at least one of the following is true:
+/// * Inst checks if Xn contains the result of a successful authentication and
+///   terminates the program on failure. Note that Inst can either naturally
+///   dereference Xn (load, branch, return, etc. instructions) or be the first
+///   instruction of an explicit checking sequence.
+/// * Inst performs safe address arithmetic AND both source and result
+///   registers, as well as any temporary registers, must be safe after
+///   execution of Inst (temporaries are not used on AArch64 and thus not
+///   currently supported/allowed).
+///   See MCPlusBuilder::analyzeAddressArithmeticsForPtrAuth for the details.
+/// * Inst fully overwrites Xn with a constant.
+struct DstState {
+  /// The set of registers whose values cannot be inspected by an attacker in
+  /// a way usable as an authentication oracle. The results of authentication
+  /// instructions should only be written to such registers.
+  BitVector CannotEscapeUnchecked;
+
+  /// A vector of sets, only used on the second analysis run.
+  /// Each element in this vector represents one of the tracked registers.
+  /// For each such register we track the set of first instructions that leak
+  /// the authenticated pointer before it was checked. This is intended to
+  /// provide clues on which instruction made the particular register unsafe.
+  ///
+  /// Please note that the mapping from MCPhysReg values to indexes in this
+  /// vector is provided by RegsToTrackInstsFor field of DstSafetyAnalysis.
+  std::vector<SetOfRelatedInsts> FirstInstLeakingReg;
+
+  /// Constructs an empty state.
+  DstState() {}
+
+  DstState(unsigned NumRegs, unsigned NumRegsToTrack)
+      : CannotEscapeUnchecked(NumRegs), FirstInstLeakingReg(NumRegsToTrack) {}
+
+  DstState &merge(const DstState &StateIn) {
+    if (StateIn.empty())
+      return *this;
+    if (empty())
+      return (*this = StateIn);
+
+    CannotEscapeUnchecked &= StateIn.CannotEscapeUnchecked;
+    for (unsigned I = 0; I < FirstInstLeakingReg.size(); ++I)
+      for (const MCInst *J : StateIn.FirstInstLeakingReg[I])
+        FirstInstLeakingReg[I].insert(J);
+    return *this;
+  }
+
+  /// Returns true if this object does not store state of any registers -
+  /// neither safe, nor unsafe ones.
+  bool empty() const { return CannotEscapeUnchecked.empty(); }
+
+  bool operator==(const DstState &RHS) const {
+    return CannotEscapeUnchecked == RHS.CannotEscapeUnchecked &&
+           FirstInstLeakingReg == RHS.FirstInstLeakingReg;
+  }
+  bool operator!=(const DstState &RHS) const { return !((*this) == RHS); }
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const DstState &S) {
+  OS << "dst-state<";
+  if (S.empty()) {
+    OS << "empty";
+  } else {
+    OS << "CannotEscapeUnchecked: " << S.CannotEscapeUnchecked << ", ";
+    printInstsShort(OS, S.FirstInstLeakingReg);
+  }
+  OS << ">";
+  return OS;
+}
+
+class DstStatePrinter {
+public:
+  void print(raw_ostream &OS, const DstState &S) const;
+  explicit DstStatePrinter(const BinaryContext &BC) : BC(BC) {}
+
+private:
+  const BinaryContext &BC;
+};
+
+void DstStatePrinter::print(raw_ostream &OS, const DstState &S) const {
+  RegStatePrinter RegStatePrinter(BC);
+  OS << "dst-state<";
+  if (S.empty()) {
+    assert(S.CannotEscapeUnchecked.empty());
+    assert(S.FirstInstLeakingReg.empty());
+    OS << "empty";
+  } else {
+    OS << "CannotEscapeUnchecked: ";
+    RegStatePrinter.print(OS, S.CannotEscapeUnchecked);
+    OS << ", ";
+    printInstsShort(OS, S.FirstInstLeakingReg);
+  }
+  OS << ">";
+}
+
+/// Computes which registers are safe to be written to by auth instructions.
+///
+/// This is the base class for two implementations: a dataflow-based analysis
+/// which is intended to be used for most functions and a simplified CFG-unaware
+/// version for functions without reconstructed CFG.
+class DstSafetyAnalysis {
+public:
+  DstSafetyAnalysis(BinaryFunction &BF, ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : BC(BF.getBinaryContext()), NumRegs(BC.MRI->getNumRegs()),
+        RegsToTrackInstsFor(RegsToTrackInstsFor) {}
+
+  virtual ~DstSafetyAnalysis() {}
+
+  static std::shared_ptr<DstSafetyAnalysis>
+  create(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId,
+         ArrayRef<MCPhysReg> RegsToTrackInstsFor);
+
+  virtual void run() = 0;
+  virtual const DstState &getStateAfter(const MCInst &Inst) const = 0;
+
+protected:
+  BinaryContext &BC;
+  const unsigned NumRegs;
+
+  const TrackedRegisters RegsToTrackInstsFor;
+
+  /// Stores information about the detected instruction sequences emitted to
+  /// check an authenticated pointer. Specifically, if such sequence is detected
+  /// in a basic block, it maps the first instruction of that sequence to the
+  /// register being checked.
+  ///
+  /// As the detection of such sequences requires iterating over the adjacent
+  /// instructions, it should be done before calling computeNext(), which
+  /// operates on separate instructions.
+  DenseMap<const MCInst *, MCPhysReg> RegCheckedAt;
+
+  SetOfRelatedInsts &firstLeakingInsts(DstState &S, MCPhysReg Reg) const {
+    unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
+    return S.FirstInstLeakingReg[Index];
+  }
+  const SetOfRelatedInsts &firstLeakingInsts(const DstState &S,
+                                             MCPhysReg Reg) const {
+    unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
+    return S.FirstInstLeakingReg[Index];
+  }
+
+  /// Creates a state with all registers marked unsafe (not to be confused
+  /// with empty state).
+  DstState createUnsafeState() {
+    return DstState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters());
+  }
+
+  /// Returns the set of registers that can be leaked by this instruction.
+  /// A register is considered leaked if it has any intersection with any
+  /// register read by Inst. This is similar to how the set of clobbered
+  /// registers is computed, but taking input operands instead of outputs.
+  BitVector getLeakedRegs(const MCInst &Inst) const {
+    BitVector Leaked(NumRegs);
+
+    // Assume a call can read all registers.
+    if (BC.MIB->isCall(Inst)) {
+      Leaked.set();
+      return Leaked;
+    }
+
+    // Compute the set of registers overlapping with any register used by
+    // this instruction.
+
+    const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode());
+
+    for (MCPhysReg Reg : Desc.implicit_uses())
+      Leaked |= BC.MIB->getAliases(Reg, /*OnlySmaller=*/false);
+
+    for (const MCOperand &Op : BC.MIB->useOperands(Inst)) {
+      if (Op.isReg())
+        Leaked |= BC.MIB->getAliases(Op.getReg(), /*OnlySmaller=*/false);
+    }
+
+    return Leaked;
+  }
+
+  SmallVector<MCPhysReg> getRegsMadeProtected(const MCInst &Inst,
+                                              const BitVector &LeakedRegs,
+                                              const DstState &Cur) const {
+    SmallVector<MCPhysReg> Regs;
+
+    // A pointer can be checked, or
+    if (auto CheckedReg =
+            BC.MIB->getAuthCheckedReg(Inst, /*MayOverwrite=*/true))
+      Regs.push_back(*CheckedReg);
+    if (RegCheckedAt.contains(&Inst))
+      Regs.push_back(RegCheckedAt.at(&Inst));
+
+    // ... it can be used as a branch target, or
+    if (BC.MIB->isIndirectBranch(Inst) || BC.MIB->isIndirectCall(Inst)) {
+      bool IsAuthenticated;
+      MCPhysReg BranchDestReg =
+          BC.MIB->getRegUsedAsIndirectBranchDest(Inst, IsAuthenticated);
+      assert(BranchDestReg != BC.MIB->getNoRegister());
+      if (!IsAuthenticated)
+        Regs.push_back(BranchDestReg);
+    }
+
+    // ... it can be used as a return target, or
+    if (BC.MIB->isReturn(Inst)) {
+      bool IsAuthenticated = false;
+      std::optional<MCPhysReg> RetReg =
+          BC.MIB->getRegUsedAsRetDest(Inst, IsAuthenticated);
+      if (RetReg && !IsAuthenticated)
+        Regs.push_back(*RetReg);
+    }
+
+    // ... an address can be updated in a safe manner, or
+    if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Inst)) {
+      MCPhysReg DstReg, SrcReg;
+      std::tie(DstReg, SrcReg) = *DstAndSrc;
+      // Note that *all* registers containing the derived values must be safe,
+      // both source and destination ones. No temporaries are supported at now.
+      if (Cur.CannotEscapeUnchecked[SrcReg] &&
+          Cur.CannotEscapeUnchecked[DstReg])
+        Regs.push_back(SrcReg);
+    }
+
+    // ... the register can be overwritten in whole with a constant: for that
+    // purpose, look for the instructions with no register inputs (neither
+    // explicit nor implicit ones) and no side effects (to rule out reading
+    // not modelled locations).
+    const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode());
+    bool HasExplicitSrcRegs = llvm::any_of(BC.MIB->useOperands(Inst),
+                                           [](auto Op) { return Op.isReg(); });
+    if (!Desc.hasUnmodeledSideEffects() && !HasExplicitSrcRegs &&
+        Desc.implicit_uses().empty()) {
+      for (const MCOperand &Def : BC.MIB->defOperands(Inst))
+        Regs.push_back(Def.getReg());
+    }
+
+    return Regs;
+  }
+
+  DstState computeNext(const MCInst &Point, const DstState &Cur) {
+    if (BC.MIB->isCFI(Point))
+      return Cur;
+
+    DstStatePrinter P(BC);
+    LLVM_DEBUG({
+      dbgs() << "  DstSafetyAnalysis::ComputeNext(";
+      BC.InstPrinter->printInst(&Point, 0, "", *BC.STI, dbgs());
+      dbgs() << ", ";
+      P.print(dbgs(), Cur);
+      dbgs() << ")\n";
+    });
+
+    // If this instruction is reachable by the analysis, a non-empty state will
+    // be propagated to it sooner or later. Until then, skip computeNext().
+    if (Cur.empty()) {
+      LLVM_DEBUG(
+          { dbgs() << "Skipping computeNext(Point, Cur) as Cur is empty.\n"; });
+      return DstState();
+    }
+
+    // First, compute various properties of the instruction, taking the state
+    // after its execution into account, if necessary.
+
+    BitVector LeakedRegs = getLeakedRegs(Point);
+    SmallVector<MCPhysReg> NewProtectedRegs =
+        getRegsMadeProtected(Point, LeakedRegs, Cur);
+
+    // Then, compute the state before this instruction is executed.
+    DstState Next = Cur;
+
+    Next.CannotEscapeUnchecked.reset(LeakedRegs);
+    for (MCPhysReg Reg : RegsToTrackInstsFor.getRegisters()) {
+      if (LeakedRegs[Reg])
+        firstLeakingInsts(Next, Reg) = {&Point};
+    }
+
+    BitVector NewProtectedSubregs(NumRegs);
+    for (MCPhysReg Reg : NewProtectedRegs)
+      NewProtectedSubregs |= BC.MIB->getAliases(Reg, /*OnlySmaller=*/true);
+    Next.CannotEscapeUnchecked |= NewProtectedSubregs;
+    for (MCPhysReg Reg : RegsToTrackInstsFor.getRegisters()) {
+      if (NewProtectedSubregs[Reg])
+        firstLeakingInsts(Next, Reg).clear();
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "    .. result: (";
+      P.print(dbgs(), Next);
+      dbgs() << ")\n";
+    });
+
+    return Next;
+  }
+
+public:
+  std::vector<MCInstReference> getLeakingInsts(const MCInst &Inst,
+                                               BinaryFunction &BF,
+                                               MCPhysReg LeakedReg) const {
+    const DstState &S = getStateAfter(Inst);
+
+    std::vector<MCInstReference> Result;
+    for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) {
+      MCInstReference Ref = MCInstReference::get(Inst, BF);
+      assert(Ref && "Expected Inst to be found");
+      Result.push_back(Ref);
+    }
+    return Result;
+  }
+};
+
+class DataflowDstSafetyAnalysis
+    : public DstSafetyAnalysis,
+      public DataflowAnalysis<DataflowDstSafetyAnalysis, DstState,
+                              /*Backward=*/true, DstStatePrinter> {
+  using DFParent = DataflowAnalysis<DataflowDstSafetyAnalysis, DstState, true,
+                                    DstStatePrinter>;
+  friend DFParent;
+
+  using DstSafetyAnalysis::BC;
+  using DstSafetyAnalysis::computeNext;
+
+public:
+  DataflowDstSafetyAnalysis(BinaryFunction &BF,
+                            MCPlusBuilder::AllocatorIdTy AllocId,
+                            ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : DstSafetyAnalysis(BF, RegsToTrackInstsFor), DFParent(BF, AllocId) {}
+
+  const DstState &getStateAfter(const MCInst &Inst) const override {
+    // The dataflow analysis base class iterates backwards over the
+    // instructions, thus "after" vs. "before" difference.
+    return DFParent::getStateBefore(Inst).get();
+  }
+
+  void run() override {
+    for (BinaryBasicBlock &BB : Func) {
+      if (auto CheckerInfo = BC.MIB->getAuthCheckedReg(BB)) {
+        LLVM_DEBUG({
+          dbgs() << "Found pointer checking sequence in " << BB.getName()
+                 << ":\n";
+          traceReg(BC, "Checked register", CheckerInfo->first);
+          traceInst(BC, "First instruction", *CheckerInfo->second);
+        });
+        RegCheckedAt[CheckerInfo->second] = CheckerInfo->first;
+      }
+    }
+    DFParent::run();
+  }
+
+protected:
+  void preflight() {}
+
+  DstState getStartingStateAtBB(const BinaryBasicBlock &BB) {
+    // In general, the initial state should be empty, not everything-is-unsafe,
+    // to give a chance for some meaningful state to be propagated to BB from
+    // an indirectly reachable "exit basic block" ending with a return or tail
+    // call instruction.
+    //
+    // A basic block without any successors, on the other hand, can be
+    // pessimistically initialized to everything-is-unsafe: this will naturally
+    // handle both return and tail call instructions and is harmless for
+    // internal indirect branch instructions (such as computed gotos).
+    if (BB.succ_empty())
+      return createUnsafeState();
+
+    return DstState();
+  }
+
+  DstState getStartingStateAtPoint(const MCInst &Point) { return DstState(); }
+
+  void doConfluence(DstState &StateOut, const DstState &StateIn) {
+    DstStatePrinter P(BC);
+    LLVM_DEBUG({
+      dbgs() << "  DataflowDstSafetyAnalysis::Confluence(\n";
+      dbgs() << "    State 1: ";
+      P.print(dbgs(), StateOut);
+      dbgs() << "\n";
+      dbgs() << "    State 2: ";
+      P.print(dbgs(), StateIn);
+      dbgs() << ")\n";
+    });
+
+    StateOut.merge(StateIn);
+
+    LLVM_DEBUG({
+      dbgs() << "    merged state: ";
+      P.print(dbgs(), StateOut);
+      dbgs() << "\n";
+    });
+  }
+
+  StringRef getAnnotationName() const { return "DataflowDstSafetyAnalysis"; }
+};
+
+class CFGUnawareDstSafetyAnalysis : public DstSafetyAnalysis,
+                                    public CFGUnawareAnalysis<DstState> {
+  using DstSafetyAnalysis::BC;
+  BinaryFunction &BF;
+
+public:
+  CFGUnawareDstSafetyAnalysis(BinaryFunction &BF,
+                              MCPlusBuilder::AllocatorIdTy AllocId,
+                              ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : DstSafetyAnalysis(BF, RegsToTrackInstsFor),
+        CFGUnawareAnalysis(BF, AllocId, "CFGUnawareDstSafetyAnalysis"), BF(BF) {
+  }
+
+  void run() override {
+    DstState S = createUnsafeState();
+    for (auto &I : llvm::reverse(BF.instrs())) {
+      MCInst &Inst = I.second;
+      if (BC.MIB->isCFI(Inst))
+        continue;
+
+      // If Inst can change the control flow, we cannot be sure that the next
+      // instruction (to be executed in analyzed program) is the one processed
+      // on the previous iteration, thus pessimistically reset S before
+      // starting to analyze Inst.
+      if (BC.MIB->isCall(Inst) || BC.MIB->isBranch(Inst) ||
+          BC.MIB->isReturn(Inst)) {
+        LLVM_DEBUG({ traceInst(BC, "Control flow instruction", Inst); });
+        S = createUnsafeState();
+      }
+
+      // Attach the state *after* this instruction executes.
+      setState(Inst, S);
+
+      // Compute the next state.
+      S = computeNext(Inst, S);
+    }
+  }
+
+  const DstState &getStateAfter(const MCInst &Inst) const override {
+    return getState(Inst);
+  }
+};
+
+std::shared_ptr<DstSafetyAnalysis>
+DstSafetyAnalysis::create(BinaryFunction &BF,
+                          MCPlusBuilder::AllocatorIdTy AllocId,
+                          ArrayRef<MCPhysReg> RegsToTrackInstsFor) {
+  if (BF.hasCFG())
+    return std::make_shared<DataflowDstSafetyAnalysis>(BF, AllocId,
+                                                       RegsToTrackInstsFor);
+  return std::make_shared<CFGUnawareDstSafetyAnalysis>(BF, AllocId,
+                                                       RegsToTrackInstsFor);
+}
+
 // This function could return PartialReport<T>, but currently T is always
 // MCPhysReg, even though it is an implementation detail.
 static PartialReport<MCPhysReg> make_generic_report(MCInstReference Location,
@@ -808,6 +1313,37 @@ shouldReportSigningOracle(const BinaryContext &BC, const MCInstReference &Inst,
   return make_gadget_report(SigningOracleKind, Inst, *SignedReg);
 }
 
+static std::optional<PartialReport<MCPhysReg>>
+shouldReportAuthOracle(const BinaryContext &BC, const MCInstReference &Inst,
+                       const DstState &S) {
+  static const GadgetKind AuthOracleKind("authentication oracle found");
+
+  bool IsChecked = false;
+  std::optional<MCPhysReg> AuthReg =
+      BC.MIB->getWrittenAuthenticatedReg(Inst, IsChecked);
+  if (!AuthReg || IsChecked)
+    return std::nullopt;
+
+  LLVM_DEBUG({
+    traceInst(BC, "Found auth inst", Inst);
+    traceReg(BC, "Authenticated reg", *AuthReg);
+  });
+
+  if (S.empty()) {
+    LLVM_DEBUG({ dbgs() << "    DstState is empty!\n"; });
+    return make_generic_report(
+        Inst, "Warning: no state computed for an authentication instruction "
+              "(possibly unreachable)");
+  }
+
+  LLVM_DEBUG(
+      { traceRegMask(BC, "safe output registers", S.CannotEscapeUnchecked); });
+  if (S.CannotEscapeUnchecked[*AuthReg])
+    return std::nullopt;
+
+  return make_gadget_report(AuthOracleKind, Inst, *AuthReg);
+}
+
 template <typename T> static void iterateOverInstrs(BinaryFunction &BF, T Fn) {
   if (BF.hasCFG()) {
     for (BinaryBasicBlock &BB : BF)
@@ -840,6 +1376,9 @@ void FunctionAnalysisContext::findUnsafeUses(
   });
 
   iterateOverInstrs(BF, [&](MCInstReference Inst) {
+    if (BC.MIB->isCFI(Inst))
+      return;
+
     const SrcState &S = Analysis->getStateBefore(Inst);
 
     // If non-empty state was never propagated from the entry basic block
@@ -889,6 +1428,55 @@ void FunctionAnalysisContext::augmentUnsafeUseReports(
   }
 }
 
+void FunctionAnalysisContext::findUnsafeDefs(
+    SmallVector<PartialReport<MCPhysReg>> &Reports) {
+  if (PacRetGadgetsOnly)
+    return;
+
+  auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, {});
+  LLVM_DEBUG({ dbgs() << "Running dst register safety analysis...\n"; });
+  Analysis->run();
+  LLVM_DEBUG({
+    dbgs() << "After dst register safety analysis:\n";
+    BF.dump();
+  });
+
+  iterateOverInstrs(BF, [&](MCInstReference Inst) {
+    if (BC.MIB->isCFI(Inst))
+      return;
+
+    const DstState &S = Analysis->getStateAfter(Inst);
+
+    if (auto Report = shouldReportAuthOracle(BC, Inst, S))
+      Reports.push_back(*Report);
+  });
+}
+
+void FunctionAnalysisContext::augmentUnsafeDefReports(
+    ArrayRef<PartialReport<MCPhysReg>> Reports) {
+  SmallVector<MCPhysReg> RegsToTrack = collectRegsToTrack(Reports);
+  // Re-compute the analysis with register tracking.
+  auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, RegsToTrack);
+  LLVM_DEBUG(
+      { dbgs() << "\nRunning detailed dst register safety analysis...\n"; });
+  Analysis->run();
+  LLVM_DEBUG({
+    dbgs() << "After detailed dst register safety analysis:\n";
+    BF.dump();
+  });
+
+  // Augment gadget reports.
+  for (auto &Report : Reports) {
+    MCInstReference Location = Report.Issue->Location;
+    LLVM_DEBUG({ traceInst(BC, "Attaching leakage info to", Location); });
+    assert(Report.RequestedDetails &&
+           "Should be removed by handleSimpleReports");
+    auto DetailedInfo = std::make_shared<LeakageInfo>(
+        Analysis->getLeakingInsts(Location, BF, *Report.RequestedDetails));
+    Result.Diagnostics.emplace_back(Report.Issue, DetailedInfo);
+  }
+}
+
 void FunctionAnalysisContext::handleSimpleReports(
     SmallVector<PartialReport<MCPhysReg>> &Reports) {
   // Before re-running the detailed analysis, process the reports which do not
@@ -912,6 +1500,12 @@ void FunctionAnalysisContext::run() {
   handleSimpleReports(UnsafeUses);
   if (!UnsafeUses.empty())
     augmentUnsafeUseReports(UnsafeUses);
+
+  SmallVector<PartialReport<MCPhysReg>> UnsafeDefs;
+  findUnsafeDefs(UnsafeDefs);
+  handleSimpleReports(UnsafeDefs);
+  if (!UnsafeDefs.empty())
+    augmentUnsafeDefReports(UnsafeDefs);
 }
 
 void Analysis::runOnFunction(BinaryFunction &BF,
@@ -1015,6 +1609,12 @@ void ClobberingInfo::print(raw_ostream &OS,
   printRelatedInstrs(OS, Location, ClobberingInstrs);
 }
 
+void LeakageInfo::print(raw_ostream &OS, const MCInstReference Location) const {
+  OS << "  The " << LeakingInstrs.size()
+     << " instructions that leak the affected registers are:\n";
+  printRelatedInstrs(OS, Location, LeakingInstrs);
+}
+
 void GenericDiagnostic::generateReport(raw_ostream &OS,
                                        const BinaryContext &BC) const {
   printBasicInfo(OS, BC, Text);
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index a253522e4fb1..7ad4e6a2e141 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -546,7 +546,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
     return Res;
 
   for (auto Iter = FromIter; Iter != ToIter;) {
-    const uint32_t Src = Iter->first;
+    const uint32_t Src = Iter->second >> 1;
     if (Iter->second & BRANCHENTRY) {
       ++Iter;
       continue;
@@ -557,7 +557,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
       ++Iter;
     if (Iter->second & BRANCHENTRY)
       break;
-    Res.emplace_back(Src, Iter->first);
+    Res.emplace_back(Src, Iter->second >> 1);
   }
 
   return Res;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 178c9d3a6373..5c8af3710720 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -49,6 +49,9 @@ static cl::opt<bool>
                      cl::desc("aggregate basic samples (without LBR info)"),
                      cl::cat(AggregatorCategory));
 
+cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
+                     cl::cat(AggregatorCategory));
+
 static cl::opt<std::string>
     ITraceAggregation("itrace",
                       cl::desc("Generate LBR info with perf itrace argument"),
@@ -181,11 +184,21 @@ void DataAggregator::start() {
 
   findPerfExecutable();
 
+  if (opts::ArmSPE) {
+    // pid    from_ip      to_ip        flags
+    // where flags could be:
+    // P/M: whether branch was Predicted or Mispredicted.
+    // N: optionally appears when the branch was Not-Taken (ie fall-through)
+    // 12345  0x123/0x456/PN/-/-/8/RET/-
+    opts::ITraceAggregation = "bl";
+    opts::ParseMemProfile = true;
+    opts::BasicAggregation = false;
+  }
+
   if (opts::BasicAggregation) {
-    launchPerfProcess("events without LBR",
-                      MainEventsPPI,
+    launchPerfProcess("events without LBR", MainEventsPPI,
                       "script -F pid,event,ip",
-                      /*Wait = */false);
+                      /*Wait = */ false);
   } else if (!opts::ITraceAggregation.empty()) {
     // Disable parsing memory profile from trace data, unless requested by user.
     if (!opts::ParseMemProfile.getNumOccurrences())
@@ -524,8 +537,7 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
 
 heatmap:
   // Sort parsed traces for faster processing.
-  if (!opts::BasicAggregation)
-    llvm::sort(Traces, llvm::less_first());
+  llvm::sort(Traces, llvm::less_first());
 
   if (!opts::HeatmapMode)
     return Error::success();
@@ -823,13 +835,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
 
   LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
                     << FromFunc->getPrintName() << ":" << Trace << '\n');
-  for (auto [From, To] : *FTs) {
-    if (BAT) {
-      From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
-      To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
-    }
+  for (const auto &[From, To] : *FTs)
     doIntraBranch(*ParentFunc, From, To, Count, false);
-  }
 
   return true;
 }
@@ -870,13 +877,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (IsReturn) {
-    if (From)
-      FromBB = BF.getBasicBlockContainingOffset(From - 1);
-    else
-      LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
-  } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
-             !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
+  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
+      From == FromBB->getOffset() &&
+      (IsReturn ? From : !(FromBB->isEntryPoint() || FromBB->isLandingPad()))) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -994,9 +997,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
   if (std::error_code EC = MispredStrRes.getError())
     return EC;
   StringRef MispredStr = MispredStrRes.get();
-  if (MispredStr.size() != 1 ||
-      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
-    reportError("expected single char for mispred bit");
+  // SPE brstack mispredicted flags might be up to two characters long:
+  // 'PN' or 'MN'. Where 'N' optionally appears.
+  bool ValidStrSize = opts::ArmSPE
+                          ? MispredStr.size() >= 1 && MispredStr.size() <= 2
+                          : MispredStr.size() == 1;
+  bool SpeTakenBitErr =
+      (opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
+  bool PredictionBitErr =
+      !ValidStrSize ||
+      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
+  if (SpeTakenBitErr)
+    reportError("expected 'N' as SPE prediction bit for a not-taken branch");
+  if (PredictionBitErr)
+    reportError("expected 'P', 'M' or '-' char as a prediction bit");
+
+  if (SpeTakenBitErr || PredictionBitErr) {
     Diag << "Found: " << MispredStr << "\n";
     return make_error_code(llvm::errc::io_error);
   }
@@ -1202,12 +1218,14 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
 std::error_code DataAggregator::parseAggregatedLBREntry() {
   enum AggregatedLBREntry : char {
     INVALID = 0,
-    EVENT_NAME,        // E
-    TRACE,             // T
-    SAMPLE,            // S
-    BRANCH,            // B
-    FT,                // F
-    FT_EXTERNAL_ORIGIN // f
+    EVENT_NAME,         // E
+    TRACE,              // T
+    RETURN,             // R
+    SAMPLE,             // S
+    BRANCH,             // B
+    FT,                 // F
+    FT_EXTERNAL_ORIGIN, // f
+    FT_EXTERNAL_RETURN  // r
   } Type = INVALID;
 
   /// The number of fields to parse, set based on \p Type.
@@ -1235,20 +1253,22 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
 
     Type = StringSwitch<AggregatedLBREntry>(Str)
                .Case("T", TRACE)
+               .Case("R", RETURN)
                .Case("S", SAMPLE)
                .Case("E", EVENT_NAME)
                .Case("B", BRANCH)
                .Case("F", FT)
                .Case("f", FT_EXTERNAL_ORIGIN)
+               .Case("r", FT_EXTERNAL_RETURN)
                .Default(INVALID);
 
     if (Type == INVALID) {
-      reportError("expected T, S, E, B, F or f");
+      reportError("expected T, R, S, E, B, F, f or r");
       return make_error_code(llvm::errc::io_error);
     }
 
     using SSI = StringSwitch<int>;
-    AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2);
+    AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
     CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
   }
 
@@ -1305,17 +1325,30 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   if (ToFunc)
     ToFunc->setHasProfileAvailable();
 
-  /// For legacy fall-through types, adjust locations to match Trace container.
-  if (Type == FT || Type == FT_EXTERNAL_ORIGIN) {
+  /// For fall-through types, adjust locations to match Trace container.
+  if (Type == FT || Type == FT_EXTERNAL_ORIGIN || Type == FT_EXTERNAL_RETURN) {
     Addr[2] = Location(Addr[1]->Offset); // Trace To
     Addr[1] = Location(Addr[0]->Offset); // Trace From
-    // Put a magic value into Trace Branch to differentiate from a full trace.
-    Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
+    // Put a magic value into Trace Branch to differentiate from a full trace:
+    if (Type == FT)
+      Addr[0] = Location(Trace::FT_ONLY);
+    else if (Type == FT_EXTERNAL_ORIGIN)
+      Addr[0] = Location(Trace::FT_EXTERNAL_ORIGIN);
+    else if (Type == FT_EXTERNAL_RETURN)
+      Addr[0] = Location(Trace::FT_EXTERNAL_RETURN);
+    else
+      llvm_unreachable("Unexpected fall-through type");
   }
 
-  /// For legacy branch type, mark Trace To to differentite from a full trace.
-  if (Type == BRANCH) {
+  /// For branch type, mark Trace To to differentiate from a full trace.
+  if (Type == BRANCH)
     Addr[2] = Location(Trace::BR_ONLY);
+
+  if (Type == RETURN) {
+    if (!Addr[0]->Offset)
+      Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
+    else
+      Returns.emplace(Addr[0]->Offset);
   }
 
   /// Record a trace.
@@ -1497,7 +1530,9 @@ void DataAggregator::printBranchStacksDiagnostics(
 }
 
 std::error_code DataAggregator::parseBranchEvents() {
-  outs() << "PERF2BOLT: parse branch events...\n";
+  std::string BranchEventTypeStr =
+      opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
+  outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
   NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
 
@@ -1525,7 +1560,8 @@ std::error_code DataAggregator::parseBranchEvents() {
     }
 
     NumEntries += Sample.LBR.size();
-    if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
+    if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
+        !NeedsSkylakeFix) {
       errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
       NeedsSkylakeFix = true;
     }
@@ -1548,10 +1584,18 @@ std::error_code DataAggregator::parseBranchEvents() {
     if (NumSamples && NumSamplesNoLBR == NumSamples) {
       // Note: we don't know if perf2bolt is being used to parse memory samples
       // at this point. In this case, it is OK to parse zero LBRs.
-      errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
-                "LBR. Record profile with perf record -j any or run perf2bolt "
-                "in no-LBR mode with -nl (the performance improvement in -nl "
-                "mode may be limited)\n";
+      if (!opts::ArmSPE)
+        errs()
+            << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
+               "LBR. Record profile with perf record -j any or run perf2bolt "
+               "in no-LBR mode with -nl (the performance improvement in -nl "
+               "mode may be limited)\n";
+      else
+        errs()
+            << "PERF2BOLT-WARNING: All recorded samples for this binary lack "
+               "SPE brstack entries. Make sure you are running Linux perf 6.14 "
+               "or later, otherwise you get zero samples. Record the profile "
+               "with: perf record -e 'arm_spe_0/branch_filter=1/'.";
     } else {
       printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
     }
@@ -1565,6 +1609,7 @@ void DataAggregator::processBranchEvents() {
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
+  Returns.emplace(Trace::FT_EXTERNAL_RETURN);
   for (const auto &[Trace, Info] : Traces) {
     bool IsReturn = checkReturn(Trace.Branch);
     // Ignore returns.
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 5a5e044184d0..174721a3a053 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -432,25 +432,33 @@ public:
 };
 
 Error LinuxKernelRewriter::detectLinuxKernelVersion() {
-  if (BinaryData *BD = BC.getBinaryDataByName("linux_banner")) {
-    const BinarySection &Section = BD->getSection();
-    const std::string S =
-        Section.getContents().substr(BD->getOffset(), BD->getSize()).str();
-
-    const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---");
-    std::smatch Match;
-    if (std::regex_search(S, Match, Re)) {
-      const unsigned Major = std::stoi(Match[2].str());
-      const unsigned Minor = std::stoi(Match[3].str());
-      const unsigned Rev = Match[5].matched ? std::stoi(Match[5].str()) : 0;
-      LinuxKernelVersion = LKVersion(Major, Minor, Rev);
-      BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str()
-                << "\n";
-      return Error::success();
-    }
+  // Check for global and local linux_banner symbol.
+  BinaryData *BD = BC.getBinaryDataByName("linux_banner");
+  if (!BD)
+    BD = BC.getBinaryDataByName("linux_banner/1");
+
+  if (!BD)
+    return createStringError(errc::executable_format_error,
+                             "unable to locate linux_banner");
+
+  const BinarySection &Section = BD->getSection();
+  const std::string S =
+      Section.getContents().substr(BD->getOffset(), BD->getSize()).str();
+
+  const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---");
+  std::smatch Match;
+  if (std::regex_search(S, Match, Re)) {
+    const unsigned Major = std::stoi(Match[2].str());
+    const unsigned Minor = std::stoi(Match[3].str());
+    const unsigned Rev = Match[5].matched ? std::stoi(Match[5].str()) : 0;
+    LinuxKernelVersion = LKVersion(Major, Minor, Rev);
+    BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str()
+              << "\n";
+    return Error::success();
   }
+
   return createStringError(errc::executable_format_error,
-                           "Linux kernel version is unknown");
+                           "Linux kernel version is unknown: " + S);
 }
 
 void LinuxKernelRewriter::processLKSections() {
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index e1aa00a3d749..93bd93b6cb98 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -780,14 +780,6 @@ void RewriteInstance::discoverFileObjects() {
 
   // For local symbols we want to keep track of associated FILE symbol name for
   // disambiguation by combined name.
-  StringRef FileSymbolName;
-  bool SeenFileName = false;
-  struct SymbolRefHash {
-    size_t operator()(SymbolRef const &S) const {
-      return std::hash<decltype(DataRefImpl::p)>{}(S.getRawDataRefImpl().p);
-    }
-  };
-  std::unordered_map<SymbolRef, StringRef, SymbolRefHash> SymbolToFileName;
   for (const ELFSymbolRef &Symbol : InputFile->symbols()) {
     Expected<StringRef> NameOrError = Symbol.getName();
     if (NameOrError && NameOrError->starts_with("__asan_init")) {
@@ -806,21 +798,8 @@ void RewriteInstance::discoverFileObjects() {
     if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined)
       continue;
 
-    if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
+    if (cantFail(Symbol.getType()) == SymbolRef::ST_File)
       FileSymbols.emplace_back(Symbol);
-      StringRef Name =
-          cantFail(std::move(NameOrError), "cannot get symbol name for file");
-      // Ignore Clang LTO artificial FILE symbol as it is not always generated,
-      // and this uncertainty is causing havoc in function name matching.
-      if (Name == "ld-temp.o")
-        continue;
-      FileSymbolName = Name;
-      SeenFileName = true;
-      continue;
-    }
-    if (!FileSymbolName.empty() &&
-        !(cantFail(Symbol.getFlags()) & SymbolRef::SF_Global))
-      SymbolToFileName[Symbol] = FileSymbolName;
   }
 
   // Sort symbols in the file by value. Ignore symbols from non-allocatable
@@ -1028,14 +1007,14 @@ void RewriteInstance::discoverFileObjects() {
       // The <id> field is used for disambiguation of local symbols since there
       // could be identical function names coming from identical file names
       // (e.g. from different directories).
-      std::string AltPrefix;
-      auto SFI = SymbolToFileName.find(Symbol);
-      if (SymbolType == SymbolRef::ST_Function && SFI != SymbolToFileName.end())
-        AltPrefix = Name + "/" + std::string(SFI->second);
+      auto SFI = llvm::upper_bound(FileSymbols, ELFSymbolRef(Symbol));
+      if (SymbolType == SymbolRef::ST_Function && SFI != FileSymbols.begin()) {
+        StringRef FileSymbolName = cantFail(SFI[-1].getName());
+        if (!FileSymbolName.empty())
+          AlternativeName = NR.uniquify(Name + "/" + FileSymbolName.str());
+      }
 
       UniqueName = NR.uniquify(Name);
-      if (!AltPrefix.empty())
-        AlternativeName = NR.uniquify(AltPrefix);
     }
 
     uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize();
@@ -1294,7 +1273,7 @@ void RewriteInstance::discoverFileObjects() {
                              FDE->getAddressRange());
   }
 
-  BC->setHasSymbolsWithFileName(SeenFileName);
+  BC->setHasSymbolsWithFileName(FileSymbols.size());
 
   // Now that all the functions were created - adjust their boundaries.
   adjustFunctionBoundaries();
@@ -1567,6 +1546,11 @@ void RewriteInstance::registerFragments() {
 
     uint64_t ParentAddress{0};
 
+    // Check if containing FILE symbol is BOLT emitted synthetic symbol marking
+    // local fragments of global parents.
+    if (cantFail(FSI[-1].getName()) == getBOLTFileSymbolName())
+      goto registerParent;
+
     // BOLT split fragment symbols are emitted just before the main function
     // symbol.
     for (ELFSymbolRef NextSymbol = Symbol; NextSymbol < StopSymbol;
diff --git a/bolt/test/AArch64/r_aarch64_prelxx.s b/bolt/test/AArch64/r_aarch64_prelxx.s
index 5cbe2c50b294..39f74301cedf 100644
--- a/bolt/test/AArch64/r_aarch64_prelxx.s
+++ b/bolt/test/AArch64/r_aarch64_prelxx.s
@@ -5,7 +5,7 @@
 // REQUIRES: system-linux
 
 // RUN: %clang %cflags -nostartfiles -nostdlib %s -o %t.exe -mlittle-endian \
-// RUN:     -Wl,-q -Wl,-z,max-page-size=4
+// RUN:     -Wl,-q -Wl,-z,max-page-size=4 -Wl,--no-relax
 // RUN: llvm-readelf -Wa %t.exe | FileCheck %s -check-prefix=CHECKPREL
 
 // CHECKPREL:       R_AARCH64_PREL16      {{.*}} .dummy + 0
@@ -36,9 +36,9 @@
   .type _start, %function
 _start:
   adrp x0, datatable
-  add x0, x0, :lo12:datable
+  add x0, x0, :lo12:datatable
   mov x0, #0
-  ret 
+  ret
 
 .section .dummy, "a", @progbits
 dummy:
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index c2ef024db947..8c05491e7bca 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -10,6 +10,10 @@
 # RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
 # Trace from an external location to a landing pad/entry point call continuation
 # RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# Return trace to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET
+# External return to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET
 # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
 
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
@@ -38,6 +42,21 @@
 # RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
 
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
+## Same for external return type
+# RUN: llvm-bolt %t --pa -p %t.pa-eret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-eret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
 ## Check pre-aggregated traces don't report zero-sized PLT fall-through as
 ## invalid trace
 # RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
@@ -92,6 +111,10 @@ Ltmp4_br:
 # PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
 # PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+## Pre-aggregated return trace
+# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1
+## External return
+# PREAGG-ERET: r #Ltmp3# #Ltmp3_br# 1
 
 # CHECK-ATTACH:      callq foo
 # CHECK-ATTACH-NEXT: count: 1
diff --git a/bolt/test/X86/linux-version.S b/bolt/test/X86/linux-version.S
index e680d0d64a21..a3d7f365304a 100644
--- a/bolt/test/X86/linux-version.S
+++ b/bolt/test/X86/linux-version.S
@@ -17,6 +17,11 @@
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
 # RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-C %s
 
+# RUN: %clang -DD -target x86_64-unknown-unknown \
+# RUN:   %cflags -nostdlib %s -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-D %s
+
   .text
   .globl foo
   .type foo, %function
@@ -46,6 +51,12 @@ linux_banner:
 #endif
 # CHECK-C: BOLT-INFO: Linux kernel version is 6.6
 
+#ifdef D
+  .hidden linux_banner
+  .string  "Linux version 6.6.15.2-2-xxx\n"
+#endif
+# CHECK-D: BOLT-INFO: Linux kernel version is 6.6
+
   .size  linux_banner, . - linux_banner
 
 ## Fake Linux Kernel sections.
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
index c9f1859c4e8a..20e7345541d9 100644
--- a/bolt/test/X86/register-fragments-bolt-symbols.s
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -29,6 +29,7 @@
 
 # RUN: link_fdata %s %t.bolt %t.preagg PREAGG
 # PREAGG: B X:0 #chain.cold.0# 1 0
+# PREAGG: B X:0 #dummy# 1 0
 # RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-REGISTER
 # RUN: FileCheck --input-file %t.bat.fdata --check-prefix=CHECK-FDATA %s
@@ -44,7 +45,13 @@
 # CHECK-SYMS: l  F .text.cold     [[#]] chain.cold.0
 # CHECK-SYMS: l  F .text          [[#]] chain
 # CHECK-SYMS: l df *ABS*          [[#]] bolt-pseudo.o
+# CHECK-SYMS: l  F .text.cold     [[#]] dummy.cold.0
+# CHECK-SYMS: l  F .text.cold.1   [[#]] dummy.cold.1
+# CHECK-SYMS: l  F .text.cold.2   [[#]] dummy.cold.2
 
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.0/1(*2) as a fragment of dummy
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.1/1(*2) as a fragment of dummy
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.2/1(*2) as a fragment of dummy
 # CHECK-REGISTER: BOLT-INFO: marking chain.cold.0/1(*2) as a fragment of chain/2(*2)
 
 # CHECK-FDATA: 0 [unknown] 0 1 chain/chain.s/2 10 0 1
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
new file mode 100644
index 000000000000..717bf40df3d0
--- /dev/null
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
@@ -0,0 +1,812 @@
+// RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe
+// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s
+// RUN: llvm-bolt-binary-analysis --scanners=pauth  %t.exe 2>&1 | FileCheck %s
+
+// The detection of compiler-generated explicit pointer checks is tested in
+// gs-pauth-address-checks.s, for that reason only test here "dummy-load" and
+// "high-bits-notbi" checkers, as the shortest examples of checkers that are
+// detected per-instruction and per-BB.
+
+// PACRET-NOT: authentication oracle found in function
+
+        .text
+
+        .type   sym,@function
+sym:
+        ret
+        .size sym, .-sym
+
+        .globl  callee
+        .type   callee,@function
+callee:
+        ret
+        .size callee, .-callee
+
+        .globl  good_ret
+        .type   good_ret,@function
+good_ret:
+// CHECK-NOT: good_ret
+        autia   x0, x1
+        ret     x0
+        .size good_ret, .-good_ret
+
+        .globl  good_call
+        .type   good_call,@function
+good_call:
+// CHECK-NOT: good_call
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        blr     x0
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call, .-good_call
+
+        .globl  good_branch
+        .type   good_branch,@function
+good_branch:
+// CHECK-NOT: good_branch
+        autia   x0, x1
+        br      x0
+        .size good_branch, .-good_branch
+
+        .globl  good_load_other_reg
+        .type   good_load_other_reg,@function
+good_load_other_reg:
+// CHECK-NOT: good_load_other_reg
+        autia   x0, x1
+        ldr     x2, [x0]
+        ret
+        .size good_load_other_reg, .-good_load_other_reg
+
+        .globl  good_load_same_reg
+        .type   good_load_same_reg,@function
+good_load_same_reg:
+// CHECK-NOT: good_load_same_reg
+        autia   x0, x1
+        ldr     x0, [x0]
+        ret
+        .size good_load_same_reg, .-good_load_same_reg
+
+        .globl  good_explicit_check
+        .type   good_explicit_check,@function
+good_explicit_check:
+// CHECK-NOT: good_explicit_check
+        autia   x0, x1
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 1f
+        brk     0x1234
+1:
+        ret
+        .size good_explicit_check, .-good_explicit_check
+
+        .globl  bad_unchecked
+        .type   bad_unchecked,@function
+bad_unchecked:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        ret
+        .size bad_unchecked, .-bad_unchecked
+
+        .globl  bad_leaked_to_subroutine
+        .type   bad_leaked_to_subroutine,@function
+bad_leaked_to_subroutine:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   paciasp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   stp     x29, x30, [sp, #-0x10]!
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mov     x29, sp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   bl      callee
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldp     x29, x30, [sp], #0x10
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autiasp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine, .-bad_leaked_to_subroutine
+
+        .globl  bad_unknown_usage_read
+        .type   bad_unknown_usage_read,@function
+bad_unknown_usage_read:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mul     x3, x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        // Registers are not accessible to an attacker under Pointer
+        // Authentication threat model, until spilled to memory.
+        // Thus, reporting the below MUL instruction is a false positive, since
+        // the next LDR instruction prevents any possible spilling of x3 unless
+        // the authentication succeeded. Though, rejecting anything except for
+        // a closed list of instruction types is the intended behavior of the
+        // analysis, so this false positive is by design.
+        mul     x3, x0, x1
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_read, .-bad_unknown_usage_read
+
+        .globl  bad_store_to_memory_and_wait
+        .type   bad_store_to_memory_and_wait,@function
+bad_store_to_memory_and_wait:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_store_to_memory_and_wait, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      str     x0, [x3]
+        autia   x0, x1
+        cbz     x3, 2f
+        str     x0, [x3]
+1:
+        // The thread performs a time-consuming computation while the result of
+        // authentication is accessible in memory.
+        nop
+2:
+        ldr     x2, [x0]
+        ret
+        .size bad_store_to_memory_and_wait, .-bad_store_to_memory_and_wait
+
+// FIXME: Known false negative: if no return instruction is reachable from a
+//        program point (this probably implies an infinite loop), such
+//        instruction cannot be detected as an authentication oracle.
+        .globl  bad_store_to_memory_and_hang
+        .type   bad_store_to_memory_and_hang,@function
+bad_store_to_memory_and_hang:
+// CHECK-NOT: bad_store_to_memory_and_hang
+        autia   x0, x1
+        cbz     x3, 2f
+        str     x0, [x3]
+1:
+        // The thread loops indefinitely while the result of authentication
+        // is accessible in memory.
+        b       1b
+2:
+        ldr     x2, [x0]
+        ret
+        .size bad_store_to_memory_and_hang, .-bad_store_to_memory_and_hang
+
+        .globl  bad_unknown_usage_subreg_read
+        .type   bad_unknown_usage_subreg_read,@function
+bad_unknown_usage_subreg_read:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mul     w3, w0, w1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        mul     w3, w0, w1
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_subreg_read, .-bad_unknown_usage_subreg_read
+
+        .globl  bad_unknown_usage_update
+        .type   bad_unknown_usage_update,@function
+bad_unknown_usage_update:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   movk    x0, #0x2a, lsl #16
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        movk    x0, #42, lsl #16 // does not overwrite x0 completely
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_update, .-bad_unknown_usage_update
+
+        .globl  good_overwrite_with_constant
+        .type   good_overwrite_with_constant,@function
+good_overwrite_with_constant:
+// CHECK-NOT: good_overwrite_with_constant
+        autia   x0, x1
+        mov     x0, #42
+        ret
+        .size good_overwrite_with_constant, .-good_overwrite_with_constant
+
+// Overwriting sensitive data by instructions with unmodelled side-effects is
+// explicitly rejected, even though this particular MRS is safe.
+        .globl  bad_overwrite_with_side_effects
+        .type   bad_overwrite_with_side_effects,@function
+bad_overwrite_with_side_effects:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_overwrite_with_side_effects, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        mrs     x0, CTR_EL0
+        ret
+        .size bad_overwrite_with_side_effects, .-bad_overwrite_with_side_effects
+
+// Here the new value written by MUL to x0 is completely unrelated to the result
+// of authentication, so this is a false positive.
+// FIXME: Can/should we generalize overwriting by constant to handle such cases?
+        .globl  good_unknown_overwrite
+        .type   good_unknown_overwrite,@function
+good_unknown_overwrite:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_unknown_overwrite, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        mul     x0, x1, x2
+        ret
+        .size good_unknown_overwrite, .-good_unknown_overwrite
+
+// This is a false positive: when a general-purpose register is written to as
+// a 32-bit register, its top 32 bits are zeroed, but according to LLVM
+// representation, the instruction only overwrites the Wn register.
+        .globl  good_wreg_overwrite
+        .type   good_wreg_overwrite,@function
+good_wreg_overwrite:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_wreg_overwrite, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+        autia   x0, x1
+        mov     w0, #42
+        ret
+        .size good_wreg_overwrite, .-good_wreg_overwrite
+
+        .globl  good_address_arith
+        .type   good_address_arith,@function
+good_address_arith:
+// CHECK-NOT: good_address_arith
+        autia   x0, x1
+
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x3, x2
+
+        ldr     x4, [x3]
+        mov     x0, #0
+        mov     x1, #0
+        mov     x2, #0
+
+        ret
+        .size good_address_arith, .-good_address_arith
+
+        .globl  good_ret_multi_bb
+        .type   good_ret_multi_bb,@function
+good_ret_multi_bb:
+// CHECK-NOT: good_ret_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ret     x0
+        .size good_ret_multi_bb, .-good_ret_multi_bb
+
+        .globl  good_call_multi_bb
+        .type   good_call_multi_bb,@function
+good_call_multi_bb:
+// CHECK-NOT: good_call_multi_bb
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        blr     x0
+        cbz     x1, 2f
+        nop
+2:
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call_multi_bb, .-good_call_multi_bb
+
+        .globl  good_branch_multi_bb
+        .type   good_branch_multi_bb,@function
+good_branch_multi_bb:
+// CHECK-NOT: good_branch_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        br      x0
+        .size good_branch_multi_bb, .-good_branch_multi_bb
+
+        .globl  good_load_other_reg_multi_bb
+        .type   good_load_other_reg_multi_bb,@function
+good_load_other_reg_multi_bb:
+// CHECK-NOT: good_load_other_reg_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ldr     x2, [x0]
+        cbz     x1, 2f
+        nop
+2:
+        ret
+        .size good_load_other_reg_multi_bb, .-good_load_other_reg_multi_bb
+
+        .globl  good_load_same_reg_multi_bb
+        .type   good_load_same_reg_multi_bb,@function
+good_load_same_reg_multi_bb:
+// CHECK-NOT: good_load_same_reg_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ldr     x0, [x0]
+        cbz     x1, 2f
+        nop
+2:
+        ret
+        .size good_load_same_reg_multi_bb, .-good_load_same_reg_multi_bb
+
+        .globl  good_explicit_check_multi_bb
+        .type   good_explicit_check_multi_bb,@function
+good_explicit_check_multi_bb:
+// CHECK-NOT: good_explicit_check_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 2f
+        brk     0x1234
+2:
+        cbz     x1, 3f
+        nop
+3:
+        ret
+        .size good_explicit_check_multi_bb, .-good_explicit_check_multi_bb
+
+        .globl  bad_unchecked_multi_bb
+        .type   bad_unchecked_multi_bb,@function
+bad_unchecked_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        cbz     x1, 1f
+        ldr     x2, [x0]
+1:
+        ret
+        .size bad_unchecked_multi_bb, .-bad_unchecked_multi_bb
+
+        .globl  bad_leaked_to_subroutine_multi_bb
+        .type   bad_leaked_to_subroutine_multi_bb,@function
+bad_leaked_to_subroutine_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        cbz     x1, 1f
+        ldr     x2, [x0]
+1:
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine_multi_bb, .-bad_leaked_to_subroutine_multi_bb
+
+        .globl  bad_unknown_usage_read_multi_bb
+        .type   bad_unknown_usage_read_multi_bb,@function
+bad_unknown_usage_read_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+        autia   x0, x1
+        cbz     x3, 1f
+        mul     x3, x0, x1
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_read_multi_bb, .-bad_unknown_usage_read_multi_bb
+
+        .globl  bad_unknown_usage_subreg_read_multi_bb
+        .type   bad_unknown_usage_subreg_read_multi_bb,@function
+bad_unknown_usage_subreg_read_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+        autia   x0, x1
+        cbz     x3, 1f
+        mul     w3, w0, w1
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_subreg_read_multi_bb, .-bad_unknown_usage_subreg_read_multi_bb
+
+        .globl  bad_unknown_usage_update_multi_bb
+        .type   bad_unknown_usage_update_multi_bb,@function
+bad_unknown_usage_update_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+        autia   x0, x1
+        cbz     x3, 1f
+        movk    x0, #42, lsl #16  // does not overwrite x0 completely
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_update_multi_bb, .-bad_unknown_usage_update_multi_bb
+
+        .globl  good_overwrite_with_constant_multi_bb
+        .type   good_overwrite_with_constant_multi_bb,@function
+good_overwrite_with_constant_multi_bb:
+// CHECK-NOT: good_overwrite_with_constant_multi_bb
+        autia   x0, x1
+        cbz     x3, 1f
+1:
+        mov     x0, #42
+        ret
+        .size good_overwrite_with_constant_multi_bb, .-good_overwrite_with_constant_multi_bb
+
+        .globl  good_address_arith_multi_bb
+        .type   good_address_arith_multi_bb,@function
+good_address_arith_multi_bb:
+// CHECK-NOT: good_address_arith_multi_bb
+        autia   x0, x1
+        cbz     x3, 1f
+
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x0, x2
+
+        mov     x1, #0
+        mov     x2, #0
+1:
+        ldr     x3, [x0]
+        ret
+        .size good_address_arith_multi_bb, .-good_address_arith_multi_bb
+
+// FIXME: Most *_nocfg test cases contain paciasp+autiasp instructions even if
+//        LR is not spilled - this is a workaround for RET instructions being
+//        reported as non-protected, because LR state is reset at every label.
+
+        .globl  good_ret_nocfg
+        .type   good_ret_nocfg,@function
+good_ret_nocfg:
+// CHECK-NOT: good_ret_nocfg
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+
+        ret     x0
+        .size good_ret_nocfg, .-good_ret_nocfg
+
+        .globl  good_call_nocfg
+        .type   good_call_nocfg,@function
+good_call_nocfg:
+// CHECK-NOT: good_call_nocfg
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        blr     x0
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call_nocfg, .-good_call_nocfg
+
+        .globl  good_branch_nocfg
+        .type   good_branch_nocfg,@function
+good_branch_nocfg:
+// CHECK-NOT: good_branch_nocfg
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        br      x0
+        .size good_branch_nocfg, .-good_branch_nocfg
+
+        .globl  good_load_other_reg_nocfg
+        .type   good_load_other_reg_nocfg,@function
+good_load_other_reg_nocfg:
+// CHECK-NOT: good_load_other_reg_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size good_load_other_reg_nocfg, .-good_load_other_reg_nocfg
+
+        .globl  good_load_same_reg_nocfg
+        .type   good_load_same_reg_nocfg,@function
+good_load_same_reg_nocfg:
+// CHECK-NOT: good_load_same_reg_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        ldr     x0, [x0]
+
+        autiasp
+        ret
+        .size good_load_same_reg_nocfg, .-good_load_same_reg_nocfg
+
+// FIXME: Multi-instruction checker sequences are not supported without CFG.
+
+        .globl  bad_unchecked_nocfg
+        .type   bad_unchecked_nocfg,@function
+bad_unchecked_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+
+        autiasp
+        ret
+        .size bad_unchecked_nocfg, .-bad_unchecked_nocfg
+
+        .globl  bad_leaked_to_subroutine_nocfg
+        .type   bad_leaked_to_subroutine_nocfg,@function
+bad_leaked_to_subroutine_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee # Offset: 24
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine_nocfg, .-bad_leaked_to_subroutine_nocfg
+
+        .globl  bad_unknown_usage_read_nocfg
+        .type   bad_unknown_usage_read_nocfg,@function
+bad_unknown_usage_read_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mul     x3, x0, x1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_read_nocfg, .-bad_unknown_usage_read_nocfg
+
+        .globl  bad_unknown_usage_subreg_read_nocfg
+        .type   bad_unknown_usage_subreg_read_nocfg,@function
+bad_unknown_usage_subreg_read_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mul     w3, w0, w1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_subreg_read_nocfg, .-bad_unknown_usage_subreg_read_nocfg
+
+        .globl  bad_unknown_usage_update_nocfg
+        .type   bad_unknown_usage_update_nocfg,@function
+bad_unknown_usage_update_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        movk    x0, #42, lsl #16  // does not overwrite x0 completely
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_update_nocfg, .-bad_unknown_usage_update_nocfg
+
+        .globl  good_overwrite_with_constant_nocfg
+        .type   good_overwrite_with_constant_nocfg,@function
+good_overwrite_with_constant_nocfg:
+// CHECK-NOT: good_overwrite_with_constant_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mov     x0, #42
+
+        autiasp
+        ret
+        .size good_overwrite_with_constant_nocfg, .-good_overwrite_with_constant_nocfg
+
+        .globl  good_address_arith_nocfg
+        .type   good_address_arith_nocfg,@function
+good_address_arith_nocfg:
+// CHECK-NOT: good_address_arith_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x3, x2
+
+        ldr     x4, [x3]
+        mov     x0, #0
+        mov     x1, #0
+        mov     x2, #0
+
+        autiasp
+        ret
+        .size good_address_arith_nocfg, .-good_address_arith_nocfg
+
+        .globl  good_explicit_check_unrelated_reg
+        .type   good_explicit_check_unrelated_reg,@function
+good_explicit_check_unrelated_reg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_explicit_check_unrelated_reg, basic block {{[^,]+}}, at address
+        // FIXME: The below instruction is not an authentication oracle
+        autia   x2, x3    // One of possible execution paths after this instruction
+                          // ends at BRK below, thus BRK used as a trap instruction
+                          // should formally "check everything" not to introduce
+                          // false-positive here.
+        autia   x0, x1
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 1f
+        brk     0x1234
+1:
+        ldr     x4, [x2]  // Right before this instruction X2 is checked - this
+                          // should be propagated to the basic block ending with
+                          // TBZ instruction above.
+        ret
+        .size good_explicit_check_unrelated_reg, .-good_explicit_check_unrelated_reg
+
+// The last BB (in layout order) is processed first by the data-flow analysis.
+// Its initial state is usually filled in a special way (because it ends with
+// `ret` instruction), and then affects the state propagated to the other BBs
+// Thus, the case of the last instruction in a function being a jump somewhere
+// in the middle is special.
+
+        .globl  good_no_ret_from_last_bb
+        .type   good_no_ret_from_last_bb,@function
+good_no_ret_from_last_bb:
+// CHECK-NOT: good_no_ret_from_last_bb
+        paciasp
+        autiasp     // authenticates LR
+        b       2f
+1:
+        ret
+2:
+        b       1b  // LR is dereferenced by `ret`, which is executed next
+        .size good_no_ret_from_last_bb, .-good_no_ret_from_last_bb
+
+        .globl  bad_no_ret_from_last_bb
+        .type   bad_no_ret_from_last_bb,@function
+bad_no_ret_from_last_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_no_ret_from_last_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autiasp
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        paciasp
+        autiasp     // authenticates LR
+        b       2f
+1:
+        ret     x0
+2:
+        b       1b  // X0 (but not LR) is dereferenced by `ret x0`
+        .size bad_no_ret_from_last_bb, .-bad_no_ret_from_last_bb
+
+// Test that combined auth+something instructions are not reported as
+// authentication oracles.
+
+        .globl  inst_retaa
+        .type   inst_retaa,@function
+inst_retaa:
+// CHECK-NOT: inst_retaa
+        paciasp
+        retaa
+        .size inst_retaa, .-inst_retaa
+
+        .globl  inst_blraa
+        .type   inst_blraa,@function
+inst_blraa:
+// CHECK-NOT: inst_blraa
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        blraa   x0, x1
+
+        ldp     x29, x30, [sp], #16
+        retaa
+        .size inst_blraa, .-inst_blraa
+
+        .globl  inst_braa
+        .type   inst_braa,@function
+inst_braa:
+// CHECK-NOT: inst_braa
+        braa    x0, x1
+        .size inst_braa, .-inst_braa
+
+        .globl  inst_ldraa_no_wb
+        .type   inst_ldraa_no_wb,@function
+inst_ldraa_no_wb:
+// CHECK-NOT: inst_ldraa_no_wb
+        ldraa   x1, [x0]
+        ret
+        .size inst_ldraa_no_wb, .-inst_ldraa_no_wb
+
+        .globl  inst_ldraa_wb
+        .type   inst_ldraa_wb,@function
+inst_ldraa_wb:
+// CHECK-NOT: inst_ldraa_wb
+        ldraa   x1, [x0]!
+        ret
+        .size inst_ldraa_wb, .-inst_ldraa_wb
+
+        .globl  main
+        .type   main,@function
+main:
+        mov     x0, 0
+        ret
+        .size   main, .-main
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
index 82494d834a15..fbb96a63d41e 100644
--- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
@@ -113,7 +113,7 @@ simple:
 // CHECK-EMPTY:
 // PAUTH-NEXT:   Found sign inst:     00000000:        paciasp # DataflowSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Signed reg: LR
-// PAUTH-NEXT:     TrustedRegs: LR W30 W30_HI
+// PAUTH-NEXT:     TrustedRegs: LR W30 W30_HI{{[ \t]*$}}
 // PAUTH-NEXT:   Found call inst:     00000000:        blr     x0 # DataflowSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Call destination reg: X0
 // PAUTH-NEXT:     SafeToDerefRegs: W0 X0 W0_HI{{[ \t]*$}}
@@ -220,10 +220,10 @@ nocfg:
 // CHECK-EMPTY:
 // PAUTH-NEXT:   Found call inst:     00000000:        br      x0 # UNKNOWN CONTROL FLOW # Offset: 4 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Call destination reg: X0
-// PAUTH-NEXT:     SafeToDerefRegs: LR W0 W30 X0 W0_HI W30_HI
+// PAUTH-NEXT:     SafeToDerefRegs: LR W0 W30 X0 W0_HI W30_HI{{[ \t]*$}}
 // CHECK-NEXT:   Found RET inst:     00000000:         ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // CHECK-NEXT:     RetReg: LR
-// CHECK-NEXT:     SafeToDerefRegs:
+// CHECK-NEXT:     SafeToDerefRegs:{{[ \t]*$}}
 // CHECK-EMPTY:
 // CHECK-NEXT: Running detailed src register safety analysis...
 // CHECK-NEXT:   SrcSafetyAnalysis::ComputeNext(   adr     x0, __ENTRY_nocfg@0x[[ENTRY_ADDR]], src-state<SafeToDerefRegs: LR W30 W30_HI , TrustedRegs: LR W30 W30_HI , Insts: [0]()>)
@@ -251,6 +251,116 @@ nocfg:
 // CHECK-EMPTY:
 // CHECK-NEXT:   Attaching clobbering info to:     00000000:   ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: [0]()>
 
+        .globl  auth_oracle
+        .type   auth_oracle,@function
+auth_oracle:
+        autia   x0, x1
+        ret
+        .size auth_oracle, .-auth_oracle
+
+// CHECK-LABEL:Analyzing function auth_oracle, AllocatorId = 1
+// CHECK-NEXT: Binary Function "auth_oracle"  {
+// CHECK-NEXT:   Number      : 4
+// CHECK-NEXT:   State       : CFG constructed
+// ...
+// CHECK:        BB Layout   : [[BB0:[0-9a-zA-Z.]+]]
+// CHECK-NEXT: }
+// CHECK-NEXT: [[BB0]] (2 instructions, align : 1)
+// CHECK-NEXT:   Entry Point
+// CHECK-NEXT:     00000000:   autia   x0, x1
+// CHECK-NEXT:     00000004:   ret
+// CHECK-EMPTY:
+// CHECK-NEXT: DWARF CFI Instructions:
+// CHECK-NEXT:     <empty>
+// CHECK-NEXT: End of Function "auth_oracle"
+// CHECK-EMPTY:
+// CHECK-NEXT: Running src register safety analysis...
+// ...
+// CHECK:      After src register safety analysis:
+// CHECK-NEXT: Binary Function "auth_oracle"  {
+// ...
+// CHECK:      End of Function "auth_oracle"
+// ...
+// PAUTH:      Running dst register safety analysis...
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       ret     x30, dst-state<CannotEscapeUnchecked: , Insts: >)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       autia   x0, x1, dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT: After dst register safety analysis:
+// PAUTH-NEXT: Binary Function "auth_oracle"  {
+// PAUTH-NEXT:   Number      : 4
+// PAUTH-NEXT:   State       : CFG constructed
+// ...
+// PAUTH:        BB Layout   : [[BB0]]
+// PAUTH-NEXT: }
+// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1)
+// PAUTH-NEXT:   Entry Point
+// PAUTH-NEXT:     00000000:   autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-NEXT:     00000004:   ret # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-EMPTY:
+// PAUTH-NEXT: DWARF CFI Instructions:
+// PAUTH-NEXT:     <empty>
+// PAUTH-NEXT: End of Function "auth_oracle"
+// PAUTH-EMPTY:
+// PAUTH-NEXT:   Found auth inst:     00000000:        autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-NEXT:     Authenticated reg: X0
+// PAUTH-NEXT:     safe output registers: LR W30 W30_HI{{[ \t]*$}}
+// PAUTH-EMPTY:
+// PAUTH-NEXT: Running detailed dst register safety analysis...
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       ret     x30, dst-state<CannotEscapeUnchecked: , Insts: [0]()>)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0]()>)
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       autia   x0, x1, dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0]()>)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0](0x{{[0-9a-f]+}} )>)
+// PAUTH-NEXT: After detailed dst register safety analysis:
+// PAUTH-NEXT: Binary Function "auth_oracle"  {
+// PAUTH-NEXT:   Number      : 4
+// PAUTH-NEXT:   State       : CFG constructed
+// ...
+// PAUTH:        BB Layout   : [[BB0]]
+// PAUTH-NEXT: }
+// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1)
+// PAUTH-NEXT:   Entry Point
+// PAUTH-NEXT:     00000000:   autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0](0x{{[0-9a-f]+}} )>
+// PAUTH-NEXT:     00000004:   ret # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0]()>
+// PAUTH-EMPTY:
+// PAUTH-NEXT: DWARF CFI Instructions:
+// PAUTH-NEXT:     <empty>
+// PAUTH-NEXT: End of Function "auth_oracle"
+// PAUTH-EMPTY:
+// PAUTH-NEXT:   Attaching leakage info to:     00000000:      autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0](0x{{[0-9a-f]+}} )>
+
+// Gadget scanner should not crash on CFI instructions, including when debug-printing them.
+// Note that the particular debug output is not checked, but BOLT should be
+// compiled with assertions enabled to support -debug-only argument.
+
+        .globl  cfi_inst_df
+        .type   cfi_inst_df,@function
+cfi_inst_df:
+        .cfi_startproc
+        sub     sp, sp, #16
+        .cfi_def_cfa_offset 16
+        add     sp, sp, #16
+        .cfi_def_cfa_offset 0
+        ret
+        .size   cfi_inst_df, .-cfi_inst_df
+        .cfi_endproc
+
+        .globl  cfi_inst_nocfg
+        .type   cfi_inst_nocfg,@function
+cfi_inst_nocfg:
+        .cfi_startproc
+        sub     sp, sp, #16
+        .cfi_def_cfa_offset 16
+
+        adr     x0, 1f
+        br      x0
+1:
+        add     sp, sp, #16
+        .cfi_def_cfa_offset 0
+        ret
+        .size   cfi_inst_nocfg, .-cfi_inst_nocfg
+        .cfi_endproc
+
 // CHECK-LABEL:Analyzing function main, AllocatorId = 1
         .globl  main
         .type   main,@function
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 5a9752068bb9..898dce8e3fb5 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -36,9 +36,9 @@ prefix_pat = re.compile(f"^# {args.prefix}: (.*)")
 fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")
 
 # Pre-aggregated profile:
-# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
+# {T|R|S|E|B|F|f|r} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
 # <loc>: [<id>:]<offset>
-preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)")
+preagg_pat = re.compile(r"(?P<type>[TRSBFfr]) (?P<offsets_count>.*)")
 
 # No-LBR profile:
 # <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
diff --git a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
new file mode 100644
index 000000000000..91f5c857fbab
--- /dev/null
+++ b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
@@ -0,0 +1,12 @@
+## Check that Arm SPE mode is available on AArch64.
+
+REQUIRES: system-linux,perf,target=aarch64{{.*}}
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+
+RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
+
+RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
+
+CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format
+
diff --git a/bolt/test/perf2bolt/X86/perf2bolt-spe.test b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
new file mode 100644
index 000000000000..101bd3789a18
--- /dev/null
+++ b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
@@ -0,0 +1,9 @@
+## Check that Arm SPE mode is unavailable on X86.
+
+REQUIRES: system-linux,x86_64-linux
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+RUN: touch %t.empty.perf.data
+RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s
+
+CHECK: perf2bolt{{.*}} -spe is available only on AArch64.
diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp
index b9836c2397b6..cf1b31f8c0c6 100644
--- a/bolt/tools/driver/llvm-bolt.cpp
+++ b/bolt/tools/driver/llvm-bolt.cpp
@@ -237,6 +237,13 @@ int main(int argc, char **argv) {
       if (Error E = RIOrErr.takeError())
         report_error(opts::InputFilename, std::move(E));
       RewriteInstance &RI = *RIOrErr.get();
+
+      if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
+          opts::ArmSPE) {
+        errs() << ToolName << ": -spe is available only on AArch64.\n";
+        exit(1);
+      }
+
       if (!opts::PerfData.empty()) {
         if (!opts::AggregateOnly) {
           errs() << ToolName
diff --git a/bolt/unittests/Profile/CMakeLists.txt b/bolt/unittests/Profile/CMakeLists.txt
index e0aa0926b49c..ce01c6c4b949 100644
--- a/bolt/unittests/Profile/CMakeLists.txt
+++ b/bolt/unittests/Profile/CMakeLists.txt
@@ -1,11 +1,25 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  ${LLVM_TARGETS_TO_BUILD}
+  )
+
 add_bolt_unittest(ProfileTests
   DataAggregator.cpp
+  PerfSpeEvents.cpp
 
   DISABLE_LLVM_LINK_LLVM_DYLIB
   )
 
 target_link_libraries(ProfileTests
   PRIVATE
+  LLVMBOLTCore
   LLVMBOLTProfile
+  LLVMTargetParser
+  LLVMTestingSupport
   )
 
+foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+  string(TOUPPER "${tgt}" upper)
+  target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
+endforeach()
diff --git a/bolt/unittests/Profile/PerfSpeEvents.cpp b/bolt/unittests/Profile/PerfSpeEvents.cpp
new file mode 100644
index 000000000000..3e3e05395246
--- /dev/null
+++ b/bolt/unittests/Profile/PerfSpeEvents.cpp
@@ -0,0 +1,164 @@
+//===- bolt/unittests/Profile/PerfSpeEvents.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef AARCH64_AVAILABLE
+
+#include "bolt/Core/BinaryContext.h"
+#include "bolt/Profile/DataAggregator.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::bolt;
+using namespace llvm::object;
+using namespace llvm::ELF;
+
+namespace opts {
+extern cl::opt<std::string> ReadPerfEvents;
+extern cl::opt<bool> ArmSPE;
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+/// Perform checks on perf SPE branch events.
+struct PerfSpeEventsTestHelper : public testing::Test {
+  void SetUp() override {
+    initalizeLLVM();
+    prepareElf();
+    initializeBOLT();
+  }
+
+protected:
+  using Trace = DataAggregator::Trace;
+  using TakenBranchInfo = DataAggregator::TakenBranchInfo;
+
+  void initalizeLLVM() {
+    llvm::InitializeAllTargetInfos();
+    llvm::InitializeAllTargetMCs();
+    llvm::InitializeAllAsmParsers();
+    llvm::InitializeAllDisassemblers();
+    llvm::InitializeAllTargets();
+    llvm::InitializeAllAsmPrinters();
+  }
+
+  void prepareElf() {
+    memcpy(ElfBuf, "\177ELF", 4);
+    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
+    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
+    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
+    EHdr->e_machine = llvm::ELF::EM_AARCH64;
+    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
+    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
+  }
+
+  void initializeBOLT() {
+    Relocation::Arch = ObjFile->makeTriple().getArch();
+    BC = cantFail(BinaryContext::createBinaryContext(
+        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
+        ObjFile->getFileName(), nullptr, /*IsPIC*/ false,
+        DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
+    ASSERT_FALSE(!BC);
+  }
+
+  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
+  std::unique_ptr<ObjectFile> ObjFile;
+  std::unique_ptr<BinaryContext> BC;
+
+  /// Helper function to export lists to show the mismatch.
+  void reportBrStackEventMismatch(
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &Traces,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    llvm::errs() << "Traces items: \n";
+    for (const auto &[Trace, BI] : Traces)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ","
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+
+    llvm::errs() << "Expected items: \n";
+    for (const auto &[Trace, BI] : ExpectedSamples)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ", "
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+  }
+
+  /// Parse and check SPE brstack as LBR.
+  void parseAndCheckBrstackEvents(
+      uint64_t PID,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    DataAggregator DA("<pseudo input>");
+    DA.ParsingBuf = opts::ReadPerfEvents;
+    DA.BC = BC.get();
+    DataAggregator::MMapInfo MMap;
+    DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));
+
+    DA.parseBranchEvents();
+
+    EXPECT_EQ(DA.Traces.size(), ExpectedSamples.size());
+    if (DA.Traces.size() != ExpectedSamples.size())
+      reportBrStackEventMismatch(DA.Traces, ExpectedSamples);
+
+    const auto TracesBegin = DA.Traces.begin();
+    const auto TracesEnd = DA.Traces.end();
+    for (const auto &BI : ExpectedSamples) {
+      auto it = find_if(TracesBegin, TracesEnd,
+                        [&BI](const auto &Tr) { return Tr.first == BI.first; });
+
+      EXPECT_NE(it, TracesEnd);
+      EXPECT_EQ(it->second.MispredCount, BI.second.MispredCount);
+      EXPECT_EQ(it->second.TakenCount, BI.second.TakenCount);
+    }
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
+  // Check perf input with SPE branch events as brstack format.
+  // Example collection command:
+  // ```
+  // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
+  // ```
+  // How Bolt extracts the branch events:
+  // ```
+  // perf script -F pid,brstack --itrace=bl
+  // ```
+
+  opts::ArmSPE = true;
+  opts::ReadPerfEvents = "  1234  0xa001/0xa002/PN/-/-/10/COND/-\n"
+                         "  1234  0xb001/0xb002/P/-/-/4/RET/-\n"
+                         "  1234  0xc456/0xc789/P/-/-/13/-/-\n"
+                         "  1234  0xd123/0xd456/M/-/-/7/RET/-\n"
+                         "  1234  0xe001/0xe002/P/-/-/14/RET/-\n"
+                         "  1234  0xd123/0xd456/M/-/-/7/RET/-\n"
+                         "  1234  0xf001/0xf002/MN/-/-/8/COND/-\n"
+                         "  1234  0xc456/0xc789/M/-/-/13/-/-\n";
+
+  // ExpectedSamples contains the aggregated information about
+  // a branch {{Branch From, To}, {TakenCount, MispredCount}}.
+  // Consider this example trace: {{0xd123, 0xd456, Trace::BR_ONLY},
+  // {2,2}}. This entry has a TakenCount = 2, as we have two samples for
+  // (0xd123, 0xd456) in our input. It also has MispredsCount = 2,
+  // as 'M' misprediction flag appears in both cases. BR_ONLY means
+  // the trace only contains branch data.
+  std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = {
+      {{0xa001, 0xa002, Trace::BR_ONLY}, {1, 0}},
+      {{0xb001, 0xb002, Trace::BR_ONLY}, {1, 0}},
+      {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 1}},
+      {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}},
+      {{0xe001, 0xe002, Trace::BR_ONLY}, {1, 0}},
+      {{0xf001, 0xf002, Trace::BR_ONLY}, {1, 1}}};
+
+  parseAndCheckBrstackEvents(1234, ExpectedSamples);
+}
+
+#endif