libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation

... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling", and
commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

	libgomp/
	* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
	'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
	* libgomp.h (gomp_target_rev): Adjust.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
	* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
	* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
	(rev_off_host_to_dev_cpy): Remove.
	(GOMP_OFFLOAD_run): Adjust.
author: Thomas Schwinge <thomas@codesourcery.com> 2023-03-21 16:14:16 +0100
committer: Thomas Schwinge <thomas@codesourcery.com> 2023-05-08 15:58:05 +0200
commit: 130c2f3c3acd0963aeab64b77bd6b578e698a2f6 (patch)
tree: c453371e275c13a093dfe21d86e96d914044e5ec /libgomp/plugin
parent: bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a (diff)
2 files changed, 46 insertions, 33 deletions
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 347803762eb..2181bf0235f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
 {
   int dev_num = dev_num64;
   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
-			  NULL, NULL, NULL);
+			  NULL);
 }
 
 /* Output any data written to console output from the kernel.  It is expected
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b3481c408c9..ffc8e2d79d1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -56,6 +56,7 @@
 #include <unistd.h>
 #include <assert.h>
 #include <errno.h>
+#include <stdlib.h>
 
 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
    block to cache between kernel invocations.  For soft-stacks blocks bigger
@@ -1625,11 +1626,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
   return 1;
 }
 
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
 {
   CUstream stream = NULL;
-  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
 
   struct goacc_asyncqueue *aq
     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1637,14 +1638,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
   return aq;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
   free (aq);
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
 int
 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
@@ -1658,14 +1671,20 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
   return -1;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
   return true;
 }
 
 bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
+bool
 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
 				      struct goacc_asyncqueue *aq2)
 {
@@ -1925,22 +1944,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 
 
 void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
-			 CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
-			 CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
   struct targ_fn_descriptor *tgt_fn_desc
@@ -1973,9 +1976,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     }
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
-  size_t stack_size = nvptx_stacks_size ();
   bool reverse_offload = ptx_dev->rev_data != NULL;
-  CUstream copy_stream = NULL;
+  struct goacc_asyncqueue *reverse_offload_aq = NULL;
+  if (reverse_offload)
+    {
+      reverse_offload_aq
+	= nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+      if (!reverse_offload_aq)
+	exit (EXIT_FAILURE);
+    }
+
+  size_t stack_size = nvptx_stacks_size ();
 
   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -1989,8 +2000,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
 		     __FUNCTION__, fn_name, teams, threads);
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
 			 32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
@@ -2013,17 +2022,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
 				    rev_data->addrs, rev_data->sizes,
 				    rev_data->kinds, rev_data->dev_num,
-				    rev_off_dev_to_host_cpy,
-				    rev_off_host_to_dev_cpy, copy_stream);
-	    CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+				    reverse_offload_aq);
+	    if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+	      exit (EXIT_FAILURE);
 	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
 	  }
 	usleep (1);
       }
   else
     r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 		       maybe_abort_msg);
@@ -2031,6 +2038,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
 
   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+  if (reverse_offload)
+    {
+      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+	exit (EXIT_FAILURE);
+    }
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run. */
author	Thomas Schwinge <thomas@codesourcery.com>	2023-03-21 16:14:16 +0100
committer	Thomas Schwinge <thomas@codesourcery.com>	2023-05-08 15:58:05 +0200
commit	130c2f3c3acd0963aeab64b77bd6b578e698a2f6 (patch)
tree	c453371e275c13a093dfe21d86e96d914044e5ec /libgomp/plugin
parent	bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a (diff)