summaryrefslogtreecommitdiff
path: root/flang-rt
diff options
context:
space:
mode:
author modiking <mmo@nvidia.com> 2025-10-03 09:48:59 -0700
committer GitHub <noreply@github.com> 2025-10-03 09:48:59 -0700
commit 74180eb024f3e45c4e0ebeb5dd07f34f85ff6539 (patch)
tree 2a9c76c9225e184b7da17b754c4f7e4c4c49bb9e /flang-rt
parent d0e98909d28be377408b1e52fa35423a2236036c (diff)
[flang][rt] Add noinline attributes for CUDA compile path for successful compilation (#161760)
NVCC inlines more aggressively than Clang/GCC, causing the exported functions in extrema.cpp and findloc.cpp to become extremely large from function specializations, which leads to compilation timeouts. Marking the 2 functions in this change as noinline for NVCC alleviates this problem, as it removes the worst of the cross-matrix argument specializations. Also remove the workaround in https://github.com/llvm/llvm-project/pull/156542 that opted findloc.cpp out of the CUDA flang-rt build. Testing: ninja flang-rt builds in ~30 minutes; these 2 files build in ~3 minutes.
Diffstat (limited to 'flang-rt')
-rw-r--r-- flang-rt/lib/runtime/CMakeLists.txt 3
-rw-r--r-- flang-rt/lib/runtime/extrema.cpp 9
-rw-r--r-- flang-rt/lib/runtime/findloc.cpp 11
3 files changed, 13 insertions, 10 deletions
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b..e8f70bd544e0 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -178,9 +178,6 @@ endif ()
if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
set(sources ${gpu_sources})
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
- # findloc.cpp has some issues with higher compute capability. Remove it
- # from CUDA build until we can lower its memory footprint.
- list(REMOVE_ITEM supported_sources findloc.cpp)
set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
index 9846529665e8..c4575cced901 100644
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -397,9 +397,12 @@ template <TypeCategory CAT, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
struct DoPartialMaxOrMinLocHelper {
template <int KIND> struct Functor {
- RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, int dim, const Descriptor *mask,
- bool back, Terminator &terminator) const {
+ // NVCC inlines more aggressively which causes too many specializations of
+ // this function to be inlined causing compiler timeouts. Set as
+ // noinline to allow compilation to complete.
+ RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, int dim,
+ const Descriptor *mask, bool back, Terminator &terminator) const {
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
intrinsic, result, x, kind, dim, mask, back, terminator);
}
diff --git a/flang-rt/lib/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
index 5485f4b97bd2..b5031ec95508 100644
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -153,10 +153,13 @@ template <TypeCategory CAT,
class HELPER>
struct NumericFindlocHelper {
template <int KIND> struct Functor {
- RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
- Descriptor &result, const Descriptor &x, const Descriptor &target,
- int kind, int dim, const Descriptor *mask, bool back,
- Terminator &terminator) const {
+ // NVCC inlines more aggressively which causes too many specializations of
+ // this function to be inlined causing compiler timeouts. Set as
+ // noinline to allow compilation to complete.
+ RT_API_ATTRS RT_DEVICE_NOINLINE void operator()(TypeCategory targetCat,
+ int targetKind, Descriptor &result, const Descriptor &x,
+ const Descriptor &target, int kind, int dim, const Descriptor *mask,
+ bool back, Terminator &terminator) const {
switch (targetCat) {
case TypeCategory::Integer:
case TypeCategory::Unsigned: