summary | refs | log | tree | commit | diff
path: root/libgomp/plugin
diff options
context:
space:
mode:
author: Thomas Schwinge <thomas@codesourcery.com> 2023-03-21 16:14:16 +0100
committer: Thomas Schwinge <thomas@codesourcery.com> 2023-05-08 15:58:05 +0200
commit: 130c2f3c3acd0963aeab64b77bd6b578e698a2f6 (patch)
tree: c453371e275c13a093dfe21d86e96d914044e5ec /libgomp/plugin
parent: bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a (diff)
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling", and
commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

	libgomp/
	* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
	'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
	* libgomp.h (gomp_target_rev): Adjust.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
	* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
	* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
	(rev_off_host_to_dev_cpy): Remove.
	(GOMP_OFFLOAD_run): Adjust.
Diffstat (limited to 'libgomp/plugin')
-rw-r--r-- libgomp/plugin/plugin-gcn.c | 2
-rw-r--r-- libgomp/plugin/plugin-nvptx.c | 77
2 files changed, 46 insertions, 33 deletions
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 347803762eb..2181bf0235f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
{
int dev_num = dev_num64;
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
- NULL, NULL, NULL);
+ NULL);
}
/* Output any data written to console output from the kernel. It is expected
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b3481c408c9..ffc8e2d79d1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -56,6 +56,7 @@
#include <unistd.h>
#include <assert.h>
#include <errno.h>
+#include <stdlib.h>
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
block to cache between kernel invocations. For soft-stacks blocks bigger
@@ -1625,11 +1626,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
return 1;
}
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
CUstream stream = NULL;
- CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
struct goacc_asyncqueue *aq
= GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1637,14 +1638,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
return aq;
}
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+ return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
free (aq);
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
@@ -1658,14 +1671,20 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
return -1;
}
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
return true;
}
bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
+bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
struct goacc_asyncqueue *aq2)
{
@@ -1925,22 +1944,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
struct targ_fn_descriptor *tgt_fn_desc
@@ -1973,9 +1976,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
}
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
- size_t stack_size = nvptx_stacks_size ();
bool reverse_offload = ptx_dev->rev_data != NULL;
- CUstream copy_stream = NULL;
+ struct goacc_asyncqueue *reverse_offload_aq = NULL;
+ if (reverse_offload)
+ {
+ reverse_offload_aq
+ = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+ if (!reverse_offload_aq)
+ exit (EXIT_FAILURE);
+ }
+
+ size_t stack_size = nvptx_stacks_size ();
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -1989,8 +2000,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
32, threads, 1, 0, NULL, NULL, config);
if (r != CUDA_SUCCESS)
@@ -2013,17 +2022,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
rev_data->addrs, rev_data->sizes,
rev_data->kinds, rev_data->dev_num,
- rev_off_dev_to_host_cpy,
- rev_off_host_to_dev_cpy, copy_stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+ reverse_offload_aq);
+ if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+ exit (EXIT_FAILURE);
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
}
usleep (1);
}
else
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@@ -2031,6 +2038,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+ if (reverse_offload)
+ {
+ if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+ exit (EXIT_FAILURE);
+ }
}
/* TODO: Implement GOMP_OFFLOAD_async_run. */