8 files changed, 363 insertions, 212 deletions
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 202df89025f2..626638282efe 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -1574,6 +1574,12 @@ LogicalResult CppEmitter::emitType(Location loc, Type type) {
   }
   if (auto iType = dyn_cast<IndexType>(type))
     return (os << "size_t"), success();
+  if (auto sType = dyn_cast<emitc::SizeTType>(type))
+    return (os << "size_t"), success();
+  if (auto sType = dyn_cast<emitc::SignedSizeTType>(type))
+    return (os << "ssize_t"), success();
+  if (auto pType = dyn_cast<emitc::PtrDiffTType>(type))
+    return (os << "ptrdiff_t"), success();
   if (auto tType = dyn_cast<TensorType>(type)) {
     if (!tType.hasRank())
       return emitError(loc, "cannot emit unranked tensor type");
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 5a3fa160850b..93dc5ff9d35b 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -32,6 +32,8 @@ endif()
 add_mlir_dialect_library(MLIRNVVMTarget
   NVVM/Target.cpp
 
+  OBJECT
+
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR
 
@@ -109,6 +111,8 @@ endif()
 add_mlir_dialect_library(MLIRROCDLTarget
   ROCDL/Target.cpp
 
+  OBJECT
+
   LINK_COMPONENTS
   MCParser
   ${AMDGPU_LIBS}
@@ -123,17 +127,18 @@ add_mlir_dialect_library(MLIRROCDLTarget
   )
 
 if(MLIR_ENABLE_ROCM_CONVERSIONS)
-  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
-    message(SEND_ERROR
-      "Building mlir with ROCm support requires the AMDGPU backend")
-  endif()
-
   if (DEFINED ROCM_PATH)
     set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   elseif(DEFINED ENV{ROCM_PATH})
     set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   else()
-    set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+    if (WIN32)
+      # Avoid setting an UNIX path for Windows.
+      # TODO: Eventually migrate to FindHIP once it becomes a part of CMake.
+      set(DEFAULT_ROCM_PATH "" CACHE PATH "Fallback path to search for ROCm installs")
+    else()
+      set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+    endif()
   endif()
   message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}")
 
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index cc13e5b7436e..047d214b751f 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -17,9 +17,6 @@
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVM/ROCDL/Utils.h"
-#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include "llvm/IR/Constants.h"
@@ -111,9 +108,6 @@ SerializeGPUModuleBase::SerializeGPUModuleBase(
     for (Attribute attr : files.getValue())
       if (auto file = dyn_cast<StringAttr>(attr))
         fileList.push_back(file.str());
-
-  // Append standard ROCm device bitcode libraries to the files to be loaded.
-  (void)appendStandardLibs();
 }
 
 void SerializeGPUModuleBase::init() {
@@ -138,28 +132,63 @@ ArrayRef<std::string> SerializeGPUModuleBase::getFileList() const {
   return fileList;
 }
 
-LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
+LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) {
+  if (libs == AMDGCNLibraries::None)
+    return success();
   StringRef pathRef = getToolkitPath();
-  if (!pathRef.empty()) {
-    SmallVector<char, 256> path;
-    path.insert(path.begin(), pathRef.begin(), pathRef.end());
-    llvm::sys::path::append(path, "amdgcn", "bitcode");
-    pathRef = StringRef(path.data(), path.size());
-    if (!llvm::sys::fs::is_directory(pathRef)) {
-      getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
-                                  << " does not exist or is not a directory.";
-      return failure();
-    }
-    StringRef isaVersion =
-        llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
-    isaVersion.consume_front("gfx");
-    return getCommonBitcodeLibs(fileList, path, isaVersion);
+
+  // Get the path for the device libraries
+  SmallString<256> path;
+  path.insert(path.begin(), pathRef.begin(), pathRef.end());
+  llvm::sys::path::append(path, "amdgcn", "bitcode");
+  pathRef = StringRef(path.data(), path.size());
+
+  // Fail if the path is invalid.
+  if (!llvm::sys::fs::is_directory(pathRef)) {
+    getOperation().emitError() << "ROCm amdgcn bitcode path: " << pathRef
+                               << " does not exist or is not a directory";
+    return failure();
   }
+
+  // Get the ISA version.
+  StringRef isaVersion =
+      llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
+  isaVersion.consume_front("gfx");
+
+  // Helper function for adding a library.
+  auto addLib = [&](const Twine &lib) -> bool {
+    auto baseSize = path.size();
+    llvm::sys::path::append(path, lib);
+    StringRef pathRef(path.data(), path.size());
+    if (!llvm::sys::fs::is_regular_file(pathRef)) {
+      getOperation().emitRemark() << "bitcode library path: " << pathRef
+                                  << " does not exist or is not a file";
+      return true;
+    }
+    fileList.push_back(pathRef.str());
+    path.truncate(baseSize);
+    return false;
+  };
+
+  // Add ROCm device libraries. Fail if any of the libraries is not found, ie.
+  // if any of the `addLib` failed.
+  if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) ||
+      (any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) ||
+      (any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) ||
+      (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")) ||
+      (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl)) &&
+       addLib("oclc_isa_version_" + isaVersion + ".bc")))
+    return failure();
   return success();
 }
 
 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
+  // Return if there are no libs to load.
+  if (deviceLibs == AMDGCNLibraries::None && fileList.empty())
+    return SmallVector<std::unique_ptr<llvm::Module>>();
+  if (failed(appendStandardLibs(deviceLibs)))
+    return std::nullopt;
   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
   if (failed(loadBitcodeFilesFromList(module.getContext(), fileList, bcFiles,
                                       true)))
@@ -174,80 +203,77 @@ LogicalResult SerializeGPUModuleBase::handleBitcodeFile(llvm::Module &module) {
   // Stop spamming us with clang version numbers
   if (auto *ident = module.getNamedMetadata("llvm.ident"))
     module.eraseNamedMetadata(ident);
+  // Override the libModules datalayout and target triple with the compiler's
+  // data layout should there be a discrepency.
+  setDataLayoutAndTriple(module);
   return success();
 }
 
 void SerializeGPUModuleBase::handleModulePreLink(llvm::Module &module) {
-  [[maybe_unused]] std::optional<llvm::TargetMachine *> targetMachine =
-      getOrCreateTargetMachine();
-  assert(targetMachine && "expect a TargetMachine");
-  addControlVariables(module, target.hasWave64(), target.hasDaz(),
+  // If all libraries are not set, traverse the module to determine which
+  // libraries are required.
+  if (deviceLibs != AMDGCNLibraries::All) {
+    for (llvm::Function &f : module.functions()) {
+      if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
+        StringRef funcName = f.getName();
+        if ("printf" == funcName)
+          deviceLibs |= AMDGCNLibraries::OpenCL | AMDGCNLibraries::Ockl |
+                        AMDGCNLibraries::Ocml;
+        if (funcName.starts_with("__ockl_"))
+          deviceLibs |= AMDGCNLibraries::Ockl;
+        if (funcName.starts_with("__ocml_"))
+          deviceLibs |= AMDGCNLibraries::Ocml;
+        if (funcName == "__atomic_work_item_fence")
+          deviceLibs |= AMDGCNLibraries::Hip;
+      }
+    }
+  }
+  addControlVariables(module, deviceLibs, target.hasWave64(), target.hasDaz(),
                       target.hasFiniteOnly(), target.hasUnsafeMath(),
                       target.hasFastMath(), target.hasCorrectSqrt(),
                       target.getAbi());
 }
 
-// Get the paths of ROCm device libraries.
-LogicalResult SerializeGPUModuleBase::getCommonBitcodeLibs(
-    llvm::SmallVector<std::string> &libs, SmallVector<char, 256> &libPath,
-    StringRef isaVersion) {
-  auto addLib = [&](StringRef path) -> bool {
-    if (!llvm::sys::fs::is_regular_file(path)) {
-      getOperation().emitRemark() << "Bitcode library path: " << path
-                                  << " does not exist or is not a file.\n";
-      return true;
-    }
-    libs.push_back(path.str());
-    return false;
-  };
-  auto getLibPath = [&libPath](Twine lib) {
-    auto baseSize = libPath.size();
-    llvm::sys::path::append(libPath, lib + ".bc");
-    std::string path(StringRef(libPath.data(), libPath.size()).str());
-    libPath.truncate(baseSize);
-    return path;
-  };
-
-  // Add ROCm device libraries. Fail if any of the libraries is not found.
-  if (addLib(getLibPath("ocml")) || addLib(getLibPath("ockl")) ||
-      addLib(getLibPath("hip")) || addLib(getLibPath("opencl")) ||
-      addLib(getLibPath("oclc_isa_version_" + isaVersion)))
-    return failure();
-  return success();
-}
-
 void SerializeGPUModuleBase::addControlVariables(
-    llvm::Module &module, bool wave64, bool daz, bool finiteOnly,
-    bool unsafeMath, bool fastMath, bool correctSqrt, StringRef abiVer) {
-  llvm::Type *i8Ty = llvm::Type::getInt8Ty(module.getContext());
-  auto addControlVariable = [i8Ty, &module](StringRef name, bool enable) {
+    llvm::Module &module, AMDGCNLibraries libs, bool wave64, bool daz,
+    bool finiteOnly, bool unsafeMath, bool fastMath, bool correctSqrt,
+    StringRef abiVer) {
+  // Return if no device libraries are required.
+  if (libs == AMDGCNLibraries::None)
+    return;
+  // Helper function for adding control variables.
+  auto addControlVariable = [&module](StringRef name, uint32_t value,
+                                      uint32_t bitwidth) {
+    if (module.getNamedGlobal(name))
+      return;
+    llvm::IntegerType *type =
+        llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
     llvm::GlobalVariable *controlVariable = new llvm::GlobalVariable(
-        module, i8Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
-        llvm::ConstantInt::get(i8Ty, enable), name, nullptr,
-        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+        module, /*isConstant=*/type, true,
+        llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
+        llvm::ConstantInt::get(type, value), name, /*before=*/nullptr,
+        /*threadLocalMode=*/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        /*addressSpace=*/4);
     controlVariable->setVisibility(
         llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
-    controlVariable->setAlignment(llvm::MaybeAlign(1));
+    controlVariable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
     controlVariable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
   };
-  addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath);
-  addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath);
-  addControlVariable("__oclc_daz_opt", daz || fastMath);
-  addControlVariable("__oclc_correctly_rounded_sqrt32",
-                     correctSqrt && !fastMath);
-  addControlVariable("__oclc_wavefrontsize64", wave64);
-
-  llvm::Type *i32Ty = llvm::Type::getInt32Ty(module.getContext());
-  int abi = 500;
-  abiVer.getAsInteger(0, abi);
-  llvm::GlobalVariable *abiVersion = new llvm::GlobalVariable(
-      module, i32Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
-      llvm::ConstantInt::get(i32Ty, abi), "__oclc_ABI_version", nullptr,
-      llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
-  abiVersion->setVisibility(
-      llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
-  abiVersion->setAlignment(llvm::MaybeAlign(4));
-  abiVersion->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+  // Add ocml related control variables.
+  if (any(libs & AMDGCNLibraries::Ocml)) {
+    addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath, 8);
+    addControlVariable("__oclc_daz_opt", daz || fastMath, 8);
+    addControlVariable("__oclc_correctly_rounded_sqrt32",
+                       correctSqrt && !fastMath, 8);
+    addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath, 8);
+  }
+  // Add ocml or ockl related control variables.
+  if (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl))) {
+    addControlVariable("__oclc_wavefrontsize64", wave64, 8);
+    int abi = 500;
+    abiVer.getAsInteger(0, abi);
+    addControlVariable("__oclc_ABI_version", abi, 32);
+  }
 }
 
 std::optional<SmallVector<char, 0>>
@@ -307,53 +333,21 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) {
 
   if (!tap) {
     emitError(loc, "assembler initialization error");
-    return {};
+    return std::nullopt;
   }
 
   parser->setTargetParser(*tap);
   parser->Run(false);
-
-  return result;
-}
-
-#if MLIR_ENABLE_ROCM_CONVERSIONS
-namespace {
-class AMDGPUSerializer : public SerializeGPUModuleBase {
-public:
-  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                   const gpu::TargetOptions &targetOptions);
-
-  gpu::GPUModuleOp getOperation();
-
-  // Compile to HSA.
-  std::optional<SmallVector<char, 0>>
-  compileToBinary(const std::string &serializedISA);
-
-  std::optional<SmallVector<char, 0>>
-  moduleToObject(llvm::Module &llvmModule) override;
-
-private:
-  // Target options.
-  gpu::TargetOptions targetOptions;
-};
-} // namespace
-
-AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                                   const gpu::TargetOptions &targetOptions)
-    : SerializeGPUModuleBase(module, target, targetOptions),
-      targetOptions(targetOptions) {}
-
-gpu::GPUModuleOp AMDGPUSerializer::getOperation() {
-  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
+  return std::move(result);
 }
 
 std::optional<SmallVector<char, 0>>
-AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
+SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   // Assemble the ISA.
   std::optional<SmallVector<char, 0>> isaBinary = assembleIsa(serializedISA);
 
   if (!isaBinary) {
-    getOperation().emitError() << "Failed during ISA assembling.";
+    getOperation().emitError() << "failed during ISA assembling";
     return std::nullopt;
   }
 
@@ -363,7 +357,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel%%", "o", tempIsaBinaryFd,
                                          tempIsaBinaryFilename)) {
     getOperation().emitError()
-        << "Failed to create a temporary file for dumping the ISA binary.";
+        << "failed to create a temporary file for dumping the ISA binary";
     return std::nullopt;
   }
   llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
@@ -378,7 +372,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
                                          tempHsacoFilename)) {
     getOperation().emitError()
-        << "Failed to create a temporary file for the HSA code object.";
+        << "failed to create a temporary file for the HSA code object";
     return std::nullopt;
   }
   llvm::FileRemover cleanupHsaco(tempHsacoFilename);
@@ -389,7 +383,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
       lldPath,
       {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
   if (lldResult != 0) {
-    getOperation().emitError() << "lld invocation failed.";
+    getOperation().emitError() << "lld invocation failed";
     return std::nullopt;
   }
 
@@ -398,7 +392,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
       llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false);
   if (!hsacoFile) {
     getOperation().emitError()
-        << "Failed to read the HSA code object from the temp file.";
+        << "failed to read the HSA code object from the temp file";
     return std::nullopt;
   }
 
@@ -407,13 +401,13 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   return SmallVector<char, 0>(buffer.begin(), buffer.end());
 }
 
-std::optional<SmallVector<char, 0>>
-AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
+std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
+    const gpu::TargetOptions &targetOptions, llvm::Module &llvmModule) {
   // Return LLVM IR if the compilation target is offload.
 #define DEBUG_TYPE "serialize-to-llvm"
   LLVM_DEBUG({
-    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
-                 << "\n"
+    llvm::dbgs() << "LLVM IR for module: "
+                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
@@ -423,8 +417,8 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   std::optional<llvm::TargetMachine *> targetMachine =
       getOrCreateTargetMachine();
   if (!targetMachine) {
-    getOperation().emitError() << "Target Machine unavailable for triple "
-                               << triple << ", can't compile with LLVM\n";
+    getOperation().emitError() << "target Machine unavailable for triple "
+                               << triple << ", can't compile with LLVM";
     return std::nullopt;
   }
 
@@ -432,12 +426,13 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   std::optional<std::string> serializedISA =
       translateToISA(llvmModule, **targetMachine);
   if (!serializedISA) {
-    getOperation().emitError() << "Failed translating the module to ISA.";
+    getOperation().emitError() << "failed translating the module to ISA";
     return std::nullopt;
   }
 #define DEBUG_TYPE "serialize-to-isa"
   LLVM_DEBUG({
-    llvm::dbgs() << "ISA for module: " << getOperation().getNameAttr() << "\n"
+    llvm::dbgs() << "ISA for module: "
+                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << *serializedISA << "\n";
   });
 #undef DEBUG_TYPE
@@ -445,9 +440,41 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
 
+  // Compiling to binary requires a valid ROCm path, fail if it's not found.
+  if (getToolkitPath().empty()) {
+    getOperation().emitError() << "invalid ROCm path, please set a valid path";
+    return std::nullopt;
+  }
+
   // Compile to binary.
   return compileToBinary(*serializedISA);
 }
+
+#if MLIR_ENABLE_ROCM_CONVERSIONS
+namespace {
+class AMDGPUSerializer : public SerializeGPUModuleBase {
+public:
+  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                   const gpu::TargetOptions &targetOptions);
+
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  // Target options.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                                   const gpu::TargetOptions &targetOptions)
+    : SerializeGPUModuleBase(module, target, targetOptions),
+      targetOptions(targetOptions) {}
+
+std::optional<SmallVector<char, 0>>
+AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
+  return moduleToObjectImpl(targetOptions, llvmModule);
+}
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 
 std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
@@ -457,7 +484,7 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   if (!module)
     return std::nullopt;
   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
-    module->emitError("Module must be a GPU module.");
+    module->emitError("module must be a GPU module");
     return std::nullopt;
   }
 #if MLIR_ENABLE_ROCM_CONVERSIONS
@@ -466,8 +493,8 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   serializer.init();
   return serializer.run();
 #else
-  module->emitError("The `AMDGPU` target was not built. Please enable it when "
-                    "building LLVM.");
+  module->emitError("the `AMDGPU` target was not built. Please enable it when "
+                    "building LLVM");
   return std::nullopt;
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 }
@@ -477,10 +504,15 @@ ROCDLTargetAttrImpl::createObject(Attribute attribute,
                                   const SmallVector<char, 0> &object,
                                   const gpu::TargetOptions &options) const {
   gpu::CompilationTarget format = options.getCompilationTarget();
+  // If format is `fatbin` transform it to binary as `fatbin` is not yet
+  // supported.
+  if (format > gpu::CompilationTarget::Binary)
+    format = gpu::CompilationTarget::Binary;
+
+  DictionaryAttr properties{};
   Builder builder(attribute.getContext());
   return builder.getAttr<gpu::ObjectAttr>(
-      attribute,
-      format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
-                                              : format,
-      builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      properties);
 }
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index 455889377953..92e786b130ab 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -16,9 +16,12 @@
 #include "mlir/Target/LLVMIR/Dialect/All.h"
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Tools/mlir-translate/Translation.h"
+#include "llvm/IR/DebugProgramInstruction.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 
+extern llvm::cl::opt<bool> WriteNewDbgInfoFormat;
+
 using namespace mlir;
 
 namespace mlir {
@@ -31,6 +34,13 @@ void registerToLLVMIRTranslation() {
         if (!llvmModule)
           return failure();
 
+        // When printing LLVM IR, we should convert the module to the debug info
+        // format that LLVM expects us to print.
+        // See https://llvm.org/docs/RemoveDIsDebugInfo.html
+        llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule,
+                                                     WriteNewDbgInfoFormat);
+        if (WriteNewDbgInfoFormat)
+          llvmModule->removeDebugIntrinsicDeclarations();
         llvmModule->print(output, nullptr);
         return success();
       },
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6ec4c120c11e..eabc4b30f57a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -388,8 +389,19 @@ static LogicalResult inlineConvertOmpRegions(
     // be processed multiple times.
     moduleTranslation.forgetMapping(region);
 
-    if (potentialTerminator && potentialTerminator->isTerminator())
-      potentialTerminator->insertAfter(&builder.GetInsertBlock()->back());
+    if (potentialTerminator && potentialTerminator->isTerminator()) {
+      llvm::BasicBlock *block = builder.GetInsertBlock();
+      if (block->empty()) {
+        // this can happen for really simple reduction init regions e.g.
+        // %0 = llvm.mlir.constant(0 : i32) : i32
+        // omp.yield(%0 : i32)
+        // because the llvm.mlir.constant (MLIR op) isn't converted into any
+        // llvm op
+        potentialTerminator->insertInto(block, block->begin());
+      } else {
+        potentialTerminator->insertAfter(&block->back());
+      }
+    }
 
     return success();
   }
@@ -762,7 +774,7 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
 /// Allocate space for privatized reduction variables.
 template <typename T>
 static void allocByValReductionVars(
-    T loop, llvm::IRBuilderBase &builder,
+    T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
     LLVM::ModuleTranslation &moduleTranslation,
     llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
@@ -770,17 +782,15 @@ static void allocByValReductionVars(
     DenseMap<Value, llvm::Value *> &reductionVariableMap,
     llvm::ArrayRef<bool> isByRefs) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
-  builder.restoreIP(allocaIP);
-  auto args =
-      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
+  builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
   for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
     if (isByRefs[i])
       continue;
     llvm::Value *var = builder.CreateAlloca(
         moduleTranslation.convertType(reductionDecls[i].getType()));
-    moduleTranslation.mapValue(args[i], var);
-    privateReductionVariables.push_back(var);
+    moduleTranslation.mapValue(reductionArgs[i], var);
+    privateReductionVariables[i] = var;
     reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
   }
 }
@@ -826,14 +836,17 @@ static void collectReductionInfo(
   // Collect the reduction information.
   reductionInfos.reserve(numReductions);
   for (unsigned i = 0; i < numReductions; ++i) {
-    llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
+    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr;
     if (owningAtomicReductionGens[i])
       atomicGen = owningAtomicReductionGens[i];
     llvm::Value *variable =
         moduleTranslation.lookupValue(loop.getReductionVars()[i]);
     reductionInfos.push_back(
         {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
-         privateReductionVariables[i], owningReductionGens[i], atomicGen});
+         privateReductionVariables[i],
+         /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
+         owningReductionGens[i],
+         /*ReductionGenClang=*/nullptr, atomicGen});
   }
 }
 
@@ -911,16 +924,20 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
 
-  SmallVector<llvm::Value *> privateReductionVariables;
+  SmallVector<llvm::Value *> privateReductionVariables(
+      wsloopOp.getNumReductionVars());
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  allocByValReductionVars(wsloopOp, builder, moduleTranslation, allocaIP,
-                          reductionDecls, privateReductionVariables,
+
+  MutableArrayRef<BlockArgument> reductionArgs =
+      wsloopOp.getRegion().getArguments();
+
+  allocByValReductionVars(wsloopOp, reductionArgs, builder, moduleTranslation,
+                          allocaIP, reductionDecls, privateReductionVariables,
                           reductionVariableMap, isByRef);
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
-  ArrayRef<BlockArgument> reductionArgs = wsloopOp.getRegion().getArguments();
   for (unsigned i = 0; i < wsloopOp.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
 
@@ -942,7 +959,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
       // ptr
       builder.CreateStore(phis[0], var);
 
-      privateReductionVariables.push_back(var);
+      privateReductionVariables[i] = var;
       moduleTranslation.mapValue(reductionArgs[i], phis[0]);
       reductionVariableMap.try_emplace(wsloopOp.getReductionVars()[i], phis[0]);
     } else {
@@ -1140,20 +1157,40 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   // Collect reduction declarations
   SmallVector<omp::DeclareReductionOp> reductionDecls;
   collectReductionDecls(opInst, reductionDecls);
-  SmallVector<llvm::Value *> privateReductionVariables;
+  SmallVector<llvm::Value *> privateReductionVariables(
+      opInst.getNumReductionVars());
 
   auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
     // Allocate reduction vars
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
-                            reductionDecls, privateReductionVariables,
+
+    MutableArrayRef<BlockArgument> reductionArgs =
+        opInst.getRegion().getArguments().slice(
+            opInst.getNumAllocateVars() + opInst.getNumAllocatorsVars(),
+            opInst.getNumReductionVars());
+
+    allocByValReductionVars(opInst, reductionArgs, builder, moduleTranslation,
+                            allocaIP, reductionDecls, privateReductionVariables,
                             reductionVariableMap, isByRef);
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
-    MutableArrayRef<BlockArgument> reductionArgs =
-        opInst.getRegion().getArguments().take_back(
-            opInst.getNumReductionVars());
+    llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
+    allocaIP =
+        InsertPointTy(allocaIP.getBlock(),
+                      allocaIP.getBlock()->getTerminator()->getIterator());
+    SmallVector<llvm::Value *> byRefVars(opInst.getNumReductionVars());
+    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
+      if (isByRef[i]) {
+        // Allocate reduction variable (which is a pointer to the real reduciton
+        // variable allocated in the inlined region)
+        byRefVars[i] = builder.CreateAlloca(
+            moduleTranslation.convertType(reductionDecls[i].getType()));
+      }
+    }
+
+    builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca());
+
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
 
@@ -1166,18 +1203,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       assert(phis.size() == 1 &&
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
-      builder.restoreIP(allocaIP);
+
+      // mapInitializationArg finishes its block with a terminator. We need to
+      // insert before that terminator.
+      builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
 
       if (isByRef[i]) {
-        // Allocate reduction variable (which is a pointer to the real reduciton
-        // variable allocated in the inlined region)
-        llvm::Value *var = builder.CreateAlloca(
-            moduleTranslation.convertType(reductionDecls[i].getType()));
         // Store the result of the inlined region to the allocated reduction var
         // ptr
-        builder.CreateStore(phis[0], var);
+        builder.CreateStore(phis[0], byRefVars[i]);
 
-        privateReductionVariables.push_back(var);
+        privateReductionVariables[i] = byRefVars[i];
         moduleTranslation.mapValue(reductionArgs[i], phis[0]);
         reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
       } else {
@@ -1275,7 +1311,26 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
           // region. The privatizer is processed in-place (see below) before it
           // gets inlined in the parallel region and therefore processing the
           // original op is dangerous.
-          return {privVar, privatizer.clone()};
+
+          MLIRContext &context = moduleTranslation.getContext();
+          mlir::IRRewriter opCloner(&context);
+          opCloner.setInsertionPoint(privatizer);
+          auto clone = llvm::cast<mlir::omp::PrivateClauseOp>(
+              opCloner.clone(*privatizer));
+
+          // Unique the clone name to avoid clashes in the symbol table.
+          unsigned counter = 0;
+          SmallString<256> cloneName = SymbolTable::generateSymbolName<256>(
+              privatizer.getSymName(),
+              [&](llvm::StringRef candidate) {
+                return SymbolTable::lookupNearestSymbolFrom(
+                           opInst, StringAttr::get(&context, candidate)) !=
+                       nullptr;
+              },
+              counter);
+
+          clone.setSymName(cloneName);
+          return {privVar, clone};
         }
       }
 
@@ -1925,12 +1980,6 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
                             Operation *clauseOp, llvm::Value *basePointer,
                             llvm::Type *baseType, llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) {
-  // utilising getTypeSizeInBits instead of getTypeSize as getTypeSize gives
-  // the size in inconsistent byte or bit format.
-  uint64_t underlyingTypeSzInBits = dl.getTypeSizeInBits(type);
-  if (auto arrTy = llvm::dyn_cast_if_present<LLVM::LLVMArrayType>(type))
-    underlyingTypeSzInBits = getArrayElementSizeInBits(arrTy, dl);
-
   if (auto memberClause =
           mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(clauseOp)) {
     // This calculates the size to transfer based on bounds and the underlying
@@ -1956,6 +2005,12 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
         }
       }
 
+      // utilising getTypeSizeInBits instead of getTypeSize as getTypeSize gives
+      // the size in inconsistent byte or bit format.
+      uint64_t underlyingTypeSzInBits = dl.getTypeSizeInBits(type);
+      if (auto arrTy = llvm::dyn_cast_if_present<LLVM::LLVMArrayType>(type))
+        underlyingTypeSzInBits = getArrayElementSizeInBits(arrTy, dl);
+
       // The size in bytes x number of elements, the sizeInBytes stored is
       // the underyling types size, e.g. if ptr<i32>, it'll be the i32's
       // size, so we do some on the fly runtime math to get the size in
@@ -1966,7 +2021,7 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
     }
   }
 
-  return builder.getInt64(underlyingTypeSzInBits / 8);
+  return builder.getInt64(dl.getTypeSizeInBits(type) / 8);
 }
 
 void collectMapDataFromMapOperands(MapInfoData &mapData,
@@ -2263,7 +2318,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers(
 
   // This creates the initial MEMBER_OF mapping that consists of
   // the parent/top level container (same as above effectively, except
-  // with a fixed initial compile time size and seperate maptype which
+  // with a fixed initial compile time size and separate maptype which
   // indicates the true mape type (tofrom etc.). This parent mapping is
   // only relevant if the structure in its totality is being mapped,
   // otherwise the above suffices.
@@ -2388,7 +2443,7 @@ static void processMapWithMembersOf(
 
   // If we have a partial map (no parent referenced in the map clauses of the
   // directive, only members) and only a single member, we do not need to bind
-  // the map of the member to the parent, we can pass the member seperately.
+  // the map of the member to the parent, we can pass the member separately.
   if (parentClause.getMembers().size() == 1 && parentClause.getPartialMap()) {
     auto memberClause = llvm::cast<mlir::omp::MapInfoOp>(
         parentClause.getMembers()[0].getDefiningOp());
@@ -2425,7 +2480,7 @@ createAlteredByCaptureMap(MapInfoData &mapData,
                           LLVM::ModuleTranslation &moduleTranslation,
                           llvm::IRBuilderBase &builder) {
   for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
-    // if it's declare target, skip it, it's handled seperately.
+    // if it's declare target, skip it, it's handled separately.
     if (!mapData.IsDeclareTarget[i]) {
       auto mapOp =
           mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(mapData.MapClause[i]);
@@ -2847,7 +2902,7 @@ static bool targetOpSupported(Operation &opInst) {
 static void
 handleDeclareTargetMapVar(MapInfoData &mapData,
                           LLVM::ModuleTranslation &moduleTranslation,
-                          llvm::IRBuilderBase &builder) {
+                          llvm::IRBuilderBase &builder, llvm::Function *func) {
   for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
     // In the case of declare target mapped variables, the basePointer is
     // the reference pointer generated by the convertDeclareTargetAttr
@@ -2862,19 +2917,31 @@ handleDeclareTargetMapVar(MapInfoData &mapData,
     // reference pointer and the pointer are assigned in the kernel argument
     // structure for the host.
     if (mapData.IsDeclareTarget[i]) {
+      // If the original map value is a constant, then we have to make sure all
+      // of it's uses within the current kernel/function that we are going to
+      // rewrite are converted to instructions, as we will be altering the old
+      // use (OriginalValue) from a constant to an instruction, which will be
+      // illegal and ICE the compiler if the user is a constant expression of
+      // some kind e.g. a constant GEP.
+      if (auto *constant = dyn_cast<llvm::Constant>(mapData.OriginalValue[i]))
+        convertUsersOfConstantsToInstructions(constant, func, false);
+
       // The users iterator will get invalidated if we modify an element,
-      // so we populate this vector of uses to alter each user on an individual
-      // basis to emit its own load (rather than one load for all).
+      // so we populate this vector of uses to alter each user on an
+      // individual basis to emit its own load (rather than one load for
+      // all).
       llvm::SmallVector<llvm::User *> userVec;
       for (llvm::User *user : mapData.OriginalValue[i]->users())
         userVec.push_back(user);
 
       for (llvm::User *user : userVec) {
         if (auto *insn = dyn_cast<llvm::Instruction>(user)) {
-          auto *load = builder.CreateLoad(mapData.BasePointers[i]->getType(),
-                                          mapData.BasePointers[i]);
-          load->moveBefore(insn);
-          user->replaceUsesOfWith(mapData.OriginalValue[i], load);
+          if (insn->getFunction() == func) {
+            auto *load = builder.CreateLoad(mapData.BasePointers[i]->getType(),
+                                            mapData.BasePointers[i]);
+            load->moveBefore(insn);
+            user->replaceUsesOfWith(mapData.OriginalValue[i], load);
+          }
         }
       }
     }
@@ -2992,6 +3059,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   auto &targetRegion = targetOp.getRegion();
   DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
   SmallVector<Value> mapOperands = targetOp.getMapOperands();
+  llvm::Function *llvmOutlinedFn = nullptr;
 
   LogicalResult bodyGenStatus = success();
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
@@ -3001,7 +3069,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     // original function to the new outlined function.
     llvm::Function *llvmParentFn =
         moduleTranslation.lookupFunction(parentFn.getName());
-    llvm::Function *llvmOutlinedFn = codeGenIP.getBlock()->getParent();
+    llvmOutlinedFn = codeGenIP.getBlock()->getParent();
     assert(llvmParentFn && llvmOutlinedFn &&
            "Both parent and outlined functions must exist at this point");
 
@@ -3096,7 +3164,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   // Remap access operations to declare target reference pointers for the
   // device, essentially generating extra loadop's as necessary
   if (moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice())
-    handleDeclareTargetMapVar(mapData, moduleTranslation, builder);
+    handleDeclareTargetMapVar(mapData, moduleTranslation, builder,
+                              llvmOutlinedFn);
 
   return bodyGenStatus;
 }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 94423b35d1ff..2a146f5efed3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -17,9 +17,9 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
@@ -32,12 +32,9 @@ static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
   auto *inst = llvm::cast<llvm::CallInst>(
       createIntrinsicCall(builder, intrinsic, {}, {}));
   if (maybeRange) {
-    SmallVector<llvm::APInt, 2> apInts;
-    for (int32_t i : maybeRange.asArrayRef())
-      apInts.push_back(llvm::APInt(32, i));
-    llvm::MDBuilder mdBuilder(builder.getContext());
-    llvm::MDNode *range = mdBuilder.createRange(apInts[0], apInts[1]);
-    inst->setMetadata(llvm::LLVMContext::MD_range, range);
+    llvm::ConstantRange Range(APInt(32, maybeRange[0]),
+                              APInt(32, maybeRange[1]));
+    inst->addRangeRetAttr(Range);
   }
   return inst;
 }
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 191b84acd56f..cfcf33436a89 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1664,23 +1664,26 @@ static void processMemoryEffects(llvm::Function *func, LLVMFuncOp funcOp) {
 
 // List of LLVM IR attributes that map to an explicit attribute on the MLIR
 // LLVMFuncOp.
-static constexpr std::array ExplicitAttributes{
-    StringLiteral("aarch64_pstate_sm_enabled"),
-    StringLiteral("aarch64_pstate_sm_body"),
-    StringLiteral("aarch64_pstate_sm_compatible"),
-    StringLiteral("aarch64_new_za"),
-    StringLiteral("aarch64_preserves_za"),
+static constexpr std::array kExplicitAttributes{
     StringLiteral("aarch64_in_za"),
-    StringLiteral("aarch64_out_za"),
     StringLiteral("aarch64_inout_za"),
-    StringLiteral("vscale_range"),
+    StringLiteral("aarch64_new_za"),
+    StringLiteral("aarch64_out_za"),
+    StringLiteral("aarch64_preserves_za"),
+    StringLiteral("aarch64_pstate_sm_body"),
+    StringLiteral("aarch64_pstate_sm_compatible"),
+    StringLiteral("aarch64_pstate_sm_enabled"),
+    StringLiteral("alwaysinline"),
+    StringLiteral("approx-func-fp-math"),
     StringLiteral("frame-pointer"),
-    StringLiteral("target-features"),
-    StringLiteral("unsafe-fp-math"),
     StringLiteral("no-infs-fp-math"),
     StringLiteral("no-nans-fp-math"),
-    StringLiteral("approx-func-fp-math"),
     StringLiteral("no-signed-zeros-fp-math"),
+    StringLiteral("noinline"),
+    StringLiteral("optnone"),
+    StringLiteral("target-features"),
+    StringLiteral("unsafe-fp-math"),
+    StringLiteral("vscale_range"),
 };
 
 static void processPassthroughAttrs(llvm::Function *func, LLVMFuncOp funcOp) {
@@ -1709,7 +1712,7 @@ static void processPassthroughAttrs(llvm::Function *func, LLVMFuncOp funcOp) {
     auto keyAttr = StringAttr::get(context, attrName);
 
     // Skip attributes that map to an explicit attribute on the LLVMFuncOp.
-    if (llvm::is_contained(ExplicitAttributes, attrName))
+    if (llvm::is_contained(kExplicitAttributes, attrName))
       continue;
 
     if (attr.isStringAttribute()) {
@@ -1745,6 +1748,13 @@ void ModuleImport::processFunctionAttributes(llvm::Function *func,
   processMemoryEffects(func, funcOp);
   processPassthroughAttrs(func, funcOp);
 
+  if (func->hasFnAttribute(llvm::Attribute::NoInline))
+    funcOp.setNoInline(true);
+  if (func->hasFnAttribute(llvm::Attribute::AlwaysInline))
+    funcOp.setAlwaysInline(true);
+  if (func->hasFnAttribute(llvm::Attribute::OptimizeNone))
+    funcOp.setOptimizeNone(true);
+
   if (func->hasFnAttribute("aarch64_pstate_sm_enabled"))
     funcOp.setArmStreaming(true);
   else if (func->hasFnAttribute("aarch64_pstate_sm_body"))
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 7b86b250c294..6e8b2dec75b7 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -64,6 +64,8 @@ using namespace mlir;
 using namespace mlir::LLVM;
 using namespace mlir::LLVM::detail;
 
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+
 #include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc"
 
 namespace {
@@ -1390,10 +1392,10 @@ LogicalResult ModuleTranslation::convertDialectAttributes(
   return success();
 }
 
-/// Converts the function attributes from LLVMFuncOp and attaches them to the
-/// llvm::Function.
-static void convertFunctionAttributes(LLVMFuncOp func,
-                                      llvm::Function *llvmFunc) {
+/// Converts memory effect attributes from `func` and attaches them to
+/// `llvmFunc`.
+static void convertFunctionMemoryAttributes(LLVMFuncOp func,
+                                            llvm::Function *llvmFunc) {
   if (!func.getMemory())
     return;
 
@@ -1412,6 +1414,18 @@ static void convertFunctionAttributes(LLVMFuncOp func,
   llvmFunc->setMemoryEffects(newMemEffects);
 }
 
+/// Converts function attributes from `func` and attaches them to `llvmFunc`.
+static void convertFunctionAttributes(LLVMFuncOp func,
+                                      llvm::Function *llvmFunc) {
+  if (func.getNoInlineAttr())
+    llvmFunc->addFnAttr(llvm::Attribute::NoInline);
+  if (func.getAlwaysInlineAttr())
+    llvmFunc->addFnAttr(llvm::Attribute::AlwaysInline);
+  if (func.getOptimizeNoneAttr())
+    llvmFunc->addFnAttr(llvm::Attribute::OptimizeNone);
+  convertFunctionMemoryAttributes(func, llvmFunc);
+}
+
 FailureOr<llvm::AttrBuilder>
 ModuleTranslation::convertParameterAttrs(LLVMFuncOp func, int argIdx,
                                          DictionaryAttr paramAttrs) {
@@ -1789,6 +1803,9 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext,
                   StringRef name) {
   m->getContext()->getOrLoadDialect<LLVM::LLVMDialect>();
   auto llvmModule = std::make_unique<llvm::Module>(name, llvmContext);
+  // ModuleTranslation can currently only construct modules in the old debug
+  // info format, so set the flag accordingly.
+  llvmModule->setNewDbgInfoFormatFlag(false);
   if (auto dataLayoutAttr =
           m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) {
     llvmModule->setDataLayout(cast<StringAttr>(dataLayoutAttr).getValue());
@@ -1867,6 +1884,11 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
   if (failed(translator.convertFunctions()))
     return nullptr;
 
+  // Once we've finished constructing elements in the module, we should convert
+  // it to use the debug info format desired by LLVM.
+  // See https://llvm.org/docs/RemoveDIsDebugInfo.html
+  translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
+
   if (!disableVerification &&
       llvm::verifyModule(*translator.llvmModule, &llvm::errs()))
     return nullptr;