diff options
| author | Thomas Schwinge <thomas@codesourcery.com> | 2023-03-21 16:14:16 +0100 |
|---|---|---|
| committer | Thomas Schwinge <thomas@codesourcery.com> | 2023-05-08 15:58:05 +0200 |
| commit | 130c2f3c3acd0963aeab64b77bd6b578e698a2f6 (patch) | |
| tree | c453371e275c13a093dfe21d86e96d914044e5ec /libgomp/plugin | |
| parent | bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a (diff) | |
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling",
and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".
libgomp/
* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
* libgomp.h (gomp_target_rev): Adjust.
* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
(rev_off_host_to_dev_cpy): Remove.
(GOMP_OFFLOAD_run): Adjust.
Diffstat (limited to 'libgomp/plugin')
| -rw-r--r-- | libgomp/plugin/plugin-gcn.c | 2 |
| -rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 77 |
2 files changed, 46 insertions, 33 deletions
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 347803762eb..2181bf0235f 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs, { int dev_num = dev_num64; GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num, - NULL, NULL, NULL); + NULL); } /* Output any data written to console output from the kernel. It is expected diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index b3481c408c9..ffc8e2d79d1 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -56,6 +56,7 @@ #include <unistd.h> #include <assert.h> #include <errno.h> +#include <stdlib.h> /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks block to cache between kernel invocations. For soft-stacks blocks bigger @@ -1625,11 +1626,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream) return 1; } -struct goacc_asyncqueue * -GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) +static struct goacc_asyncqueue * +nvptx_goacc_asyncqueue_construct (unsigned int flags) { CUstream stream = NULL; - CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT); + CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags); struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue)); @@ -1637,14 +1638,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) return aq; } -bool -GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) +struct goacc_asyncqueue * +GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) +{ + return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT); +} + +static bool +nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq) { CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream); free (aq); return true; } +bool 
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) +{ + return nvptx_goacc_asyncqueue_destruct (aq); +} + int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) { @@ -1658,14 +1671,20 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) return -1; } -bool -GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) +static bool +nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq) { CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream); return true; } bool +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) +{ + return nvptx_goacc_asyncqueue_synchronize (aq); +} + +bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, struct goacc_asyncqueue *aq2) { @@ -1925,22 +1944,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num) void -rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size, - CUstream stream) -{ - CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream); - CUDA_CALL_ASSERT (cuStreamSynchronize, stream); -} - -void -rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size, - CUstream stream) -{ - CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream); - CUDA_CALL_ASSERT (cuStreamSynchronize, stream); -} - -void GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) { struct targ_fn_descriptor *tgt_fn_desc @@ -1973,9 +1976,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) } nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads); - size_t stack_size = nvptx_stacks_size (); bool reverse_offload = ptx_dev->rev_data != NULL; - CUstream copy_stream = NULL; + struct goacc_asyncqueue *reverse_offload_aq = NULL; + if (reverse_offload) + { + reverse_offload_aq + = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING); + if (!reverse_offload_aq) + exit (EXIT_FAILURE); + } + + size_t stack_size = nvptx_stacks_size (); 
pthread_mutex_lock (&ptx_dev->omp_stacks.lock); void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads); @@ -1989,8 +2000,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n", __FUNCTION__, fn_name, teams, threads); - if (reverse_offload) - CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING); r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1, 32, threads, 1, 0, NULL, NULL, config); if (r != CUDA_SUCCESS) @@ -2013,17 +2022,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum, rev_data->addrs, rev_data->sizes, rev_data->kinds, rev_data->dev_num, - rev_off_dev_to_host_cpy, - rev_off_host_to_dev_cpy, copy_stream); - CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream); + reverse_offload_aq); + if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq)) + exit (EXIT_FAILURE); __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE); } usleep (1); } else r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); - if (reverse_offload) - CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); @@ -2031,6 +2038,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); pthread_mutex_unlock (&ptx_dev->omp_stacks.lock); + + if (reverse_offload) + { + if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq)) + exit (EXIT_FAILURE); + } } /* TODO: Implement GOMP_OFFLOAD_async_run. */ |
