4 files changed, 82 insertions, 29 deletions
diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
index 980307db315d..16c7944a885a 100644
--- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -34,6 +34,7 @@ add_flang_library(FIRCodeGen
 
   MLIR_LIBS
   MLIRComplexToLLVM
+  MLIRComplexToROCDLLibraryCalls
   MLIRComplexToStandard
   MLIRGPUDialect
   MLIRMathToFuncs
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index ecc04a6c9a2b..d879382555c3 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -33,6 +33,7 @@
 #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
+#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h"
 #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
@@ -1122,6 +1123,16 @@ struct AllocMemOpConversion : public fir::FIROpConversion<fir::AllocMemOp> {
     for (mlir::Value opnd : adaptor.getOperands())
       size = rewriter.create<mlir::LLVM::MulOp>(
           loc, ity, size, integerCast(loc, rewriter, ity, opnd));
+
+    // As the return value of malloc(0) is implementation-defined, allocate one
+    // byte whenever the computed size is not positive, so that the allocation
+    // status is true. This matches what the runtime does.
+    mlir::Value zero = genConstantIndex(loc, ity, rewriter, 0);
+    mlir::Value one = genConstantIndex(loc, ity, rewriter, 1);
+    mlir::Value cmp = rewriter.create<mlir::LLVM::ICmpOp>(
+        loc, mlir::LLVM::ICmpPredicate::sgt, size, zero);
+    size = rewriter.create<mlir::LLVM::SelectOp>(loc, cmp, size, one);
+
     auto mallocTyWidth = lowerTy().getIndexTypeBitwidth();
     auto mallocTy =
         mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth);
@@ -4145,22 +4156,24 @@ public:
     // conversions that affect the ModuleOp, e.g. create new
     // function operations in it. We have to run such conversions
     // as passes here.
-    mlir::OpPassManager mathConvertionPM("builtin.module");
+    mlir::OpPassManager mathConversionPM("builtin.module");
 
     bool isAMDGCN = fir::getTargetTriple(mod).isAMDGCN();
     // If compiling for AMD target some math operations must be lowered to AMD
     // GPU library calls, the rest can be converted to LLVM intrinsics, which
     // is handled in the mathToLLVM conversion. The lowering to libm calls is
     // not needed since all math operations are handled this way.
-    if (isAMDGCN)
-      mathConvertionPM.addPass(mlir::createConvertMathToROCDL());
+    if (isAMDGCN) {
+      mathConversionPM.addPass(mlir::createConvertMathToROCDL());
+      mathConversionPM.addPass(mlir::createConvertComplexToROCDLLibraryCalls());
+    }
 
     // Convert math::FPowI operations to inline implementation
     // only if the exponent's width is greater than 32, otherwise,
     // it will be lowered to LLVM intrinsic operation by a later conversion.
     mlir::ConvertMathToFuncsOptions mathToFuncsOptions{};
     mathToFuncsOptions.minWidthOfFPowIExponent = 33;
-    mathConvertionPM.addPass(
+    mathConversionPM.addPass(
         mlir::createConvertMathToFuncs(mathToFuncsOptions));
 
     mlir::ConvertComplexToStandardPassOptions complexToStandardOptions{};
@@ -4173,15 +4186,15 @@ public:
       complexToStandardOptions.complexRange =
           mlir::complex::ComplexRangeFlags::improved;
     }
-    mathConvertionPM.addPass(
+    mathConversionPM.addPass(
         mlir::createConvertComplexToStandardPass(complexToStandardOptions));
 
     // Convert Math dialect operations into LLVM dialect operations.
     // There is no way to prefer MathToLLVM patterns over MathToLibm
     // patterns (applied below), so we have to run MathToLLVM conversion here.
-    mathConvertionPM.addNestedPass<mlir::func::FuncOp>(
+    mathConversionPM.addNestedPass<mlir::func::FuncOp>(
         mlir::createConvertMathToLLVMPass());
-    if (mlir::failed(runPipeline(mathConvertionPM, mod)))
+    if (mlir::failed(runPipeline(mathConversionPM, mod)))
       return signalPassFailure();
 
     std::optional<mlir::DataLayout> dl =
diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
index 2774382c22bf..d2cf85bedd54 100644
--- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
+++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
@@ -38,7 +38,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
-#include "flang/Optimizer/OpenACC/RegisterOpenACCExtensions.h"
+#include "flang/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.h"
 #include "flang/Optimizer/OpenMP/Support/RegisterOpenMPExtensions.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -63,13 +63,14 @@ private:
   static constexpr llvm::StringRef bufferName = ".repacked";
 
   // Return value of fir::BaseBoxType that represents a temporary
-  // array created for the original box with given extents and
-  // type parameters. The new box has the default lower bounds.
-  // If useStack is true, then the temporary will be allocated
+  // array created for the original box with given lbounds/extents and
+  // type parameters. The new box has the same shape as the original
+  // array. If useStack is true, then the temporary will be allocated
   // in stack memory (when possible).
   static mlir::Value allocateTempBuffer(fir::FirOpBuilder &builder,
                                         mlir::Location loc, bool useStack,
                                         mlir::Value origBox,
+                                        llvm::ArrayRef<mlir::Value> lbounds,
                                         llvm::ArrayRef<mlir::Value> extents,
                                         llvm::ArrayRef<mlir::Value> typeParams);
 
@@ -99,7 +100,9 @@ public:
 // the presence of the stack attribute does not automatically
 // mean that the allocation is actually done in stack memory.
 // For example, we always do the heap allocation for polymorphic
-// types using Fortran runtime.
+// types using Fortran runtime. Currently, we allocate all
+// repack temporaries of derived types as polymorphic,
+// so that we can preserve the dynamic type of the original.
 // Adding the polymorpic mold to fir.alloca and then using
 // Fortran runtime to compute the allocation size could probably
 // resolve this limitation.
@@ -170,7 +173,8 @@ PackArrayConversion::matchAndRewrite(fir::PackArrayOp op,
 
 mlir::Value PackArrayConversion::allocateTempBuffer(
     fir::FirOpBuilder &builder, mlir::Location loc, bool useStack,
-    mlir::Value origBox, llvm::ArrayRef<mlir::Value> extents,
+    mlir::Value origBox, llvm::ArrayRef<mlir::Value> lbounds,
+    llvm::ArrayRef<mlir::Value> extents,
     llvm::ArrayRef<mlir::Value> typeParams) {
   auto tempType = mlir::cast<fir::SequenceType>(
       fir::extractSequenceType(origBox.getType()));
@@ -191,16 +195,35 @@ mlir::Value PackArrayConversion::allocateTempBuffer(
     assert(!isHeapAllocation && "temp must have been allocated on the stack");
 
   mlir::Type ptrType = base.getType();
-  if (llvm::isa<fir::BaseBoxType>(ptrType))
-    return base;
+  if (auto tempBoxType = mlir::dyn_cast<fir::BaseBoxType>(ptrType)) {
+    // Drop CFI_attribute_allocatable before returning the temporary
+    // box, so that the Fortran runtime does not mishandle it: take the
+    // base address here and re-create the box from it below.
+    base = builder.create<fir::BoxAddrOp>(loc, fir::boxMemRefType(tempBoxType),
+                                          base);
+    ptrType = base.getType();
+  }
 
-  mlir::Type tempBoxType = fir::BoxType::get(mlir::isa<fir::HeapType>(ptrType)
-                                                 ? ptrType
-                                                 : fir::unwrapRefType(ptrType));
+  // Create the temporary using the dynamic type of the original
+  // if it is polymorphic or has a derived type. Derived types matter
+  // because dummy arguments of a type with the SEQUENCE or BIND
+  // attribute may have a dynamic type that does not exactly match
+  // their static type. Note that for the latter case, the allocation
+  // can still be done without the mold, because the dynamic and
+  // static types must be storage compatible.
+  bool useDynamicType = fir::isBoxedRecordType(origBox.getType()) ||
+                        fir::isPolymorphicType(origBox.getType());
+  mlir::Type tempBoxType =
+      fir::wrapInClassOrBoxType(fir::unwrapRefType(ptrType),
+                                /*isPolymorphic=*/useDynamicType);
+  // Use the shape with proper lower bounds for the final box.
+  shape = builder.genShape(loc, lbounds, extents);
   mlir::Value newBox =
       builder.createBox(loc, tempBoxType, base, shape, /*slice=*/nullptr,
-                        typeParams, /*tdesc=*/nullptr);
-  return newBox;
+                        typeParams, useDynamicType ? origBox : nullptr);
+  // The new box might be !fir.class, while the original might be
+  // !fir.box - we have to add a conversion.
+  return builder.createConvert(loc, origBox.getType(), newBox);
 }
 
 mlir::FailureOr<mlir::Value>
@@ -280,16 +303,11 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder,
                             << op.getOperation() << '\n';
   }
 
-  mlir::Value tempBox =
-      allocateTempBuffer(builder, loc, op.getStack(), box, extents, typeParams);
+  mlir::Value tempBox = allocateTempBuffer(builder, loc, op.getStack(), box,
+                                           lbounds, extents, typeParams);
   if (!op.getNoCopy())
     fir::runtime::genShallowCopy(builder, loc, tempBox, box,
                                  /*resultIsAllocated=*/true);
-
-  // Set lower bounds after the original box.
-  mlir::Value shift = builder.genShift(loc, lbounds);
-  tempBox = builder.create<fir::ReboxOp>(loc, boxType, tempBox, shift,
-                                         /*slice=*/nullptr);
   builder.create<fir::ResultOp>(loc, tempBox);
 
   return ifOp.getResult(0);
diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index 7dbf21ce0c12..b60a72e4340b 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -1443,14 +1443,35 @@ struct TargetAMDGPU : public GenericTarget<TargetAMDGPU> {
   CodeGenSpecifics::Marshalling
   complexArgumentType(mlir::Location loc, mlir::Type eleTy) const override {
     CodeGenSpecifics::Marshalling marshal;
-    TODO(loc, "handle complex argument types");
+    const auto *sem = &floatToSemantics(kindMap, eleTy);
+    if (sem == &llvm::APFloat::IEEEsingle()) {
+      // Pass COMPLEX(KIND=4) as an array of two elements.
+      marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{});
+    } else if (sem == &llvm::APFloat::IEEEdouble()) {
+      // Pass COMPLEX(KIND=8) as two separate arguments.
+      marshal.emplace_back(eleTy, AT{});
+      marshal.emplace_back(eleTy, AT{});
+    } else {
+      typeTodo(sem, loc, "argument");
+    }
     return marshal;
   }
 
   CodeGenSpecifics::Marshalling
   complexReturnType(mlir::Location loc, mlir::Type eleTy) const override {
     CodeGenSpecifics::Marshalling marshal;
-    TODO(loc, "handle complex return types");
+    const auto *sem = &floatToSemantics(kindMap, eleTy);
+    if (sem == &llvm::APFloat::IEEEsingle()) {
+      // Return COMPLEX(KIND=4) as an array of two elements.
+      marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{});
+    } else if (sem == &llvm::APFloat::IEEEdouble()) {
+      // Return COMPLEX(KIND=8) via an aggregate with two fields.
+      marshal.emplace_back(mlir::TupleType::get(eleTy.getContext(),
+                                                mlir::TypeRange{eleTy, eleTy}),
+                           AT{});
+    } else {
+      typeTodo(sem, loc, "return");
+    }
     return marshal;
   }
 };