CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
CUresult cuDeviceGetCount (int *);
CUresult cuDeviceGetName (char *, int, CUdevice);
+CUresult cuDeviceGetPCIBusId (char *, int, CUdevice);
CUresult cuEventCreate (CUevent *, unsigned);
#define cuEventDestroy cuEventDestroy_v2
CUresult cuEventDestroy (CUevent);
gomp_display_string (buffer, size, ret, buf, strlen (buf));
}
+int
+gomp_get_current_numa_node ()
+{
+ return -1;
+}
+
+int
+gomp_get_numa_distance (int node1, int node2)
+{
+ (void) node1;
+ (void) node2;
+ return -1;
+}
+
ialias(omp_get_place_num_procs)
ialias(omp_get_place_proc_ids)
#endif
#include "libgomp.h"
#include "proc.h"
+#include <dirent.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
+#include <sys/syscall.h>
#ifdef HAVE_PTHREAD_AFFINITY_NP
#include "../../affinity.c"
#endif
+
+static int num_numa_nodes = 0;
+static int *numa_distances = NULL;
+
+int
+gomp_get_current_numa_node ()
+{
+ int node;
+ syscall (SYS_getcpu, NULL /* cpu */, &node, NULL /* no longer used */);
+ return node;
+}
+
+int
+gomp_get_numa_distance (int node1, int node2)
+{
+ if (node1 < 0 || node2 < 0 || num_numa_nodes < 0)
+ return -1;
+
+ if (numa_distances == NULL)
+ {
+ num_numa_nodes = -1;
+ DIR *dir = opendir ("/sys/devices/system/node");
+ if (!dir)
+ return -1;
+ struct dirent *dp;
+ int cnt = 0;
+ errno = 0;
+ while ((dp = readdir(dir)) != NULL)
+ if (strncmp ("node", dp->d_name, 4 /* strlen ("node") */) == 0)
+ cnt++;
+ else if (errno)
+ {
+ closedir (dir);
+ return -1;
+ }
+ closedir (dir);
+ numa_distances = (int *) gomp_malloc (sizeof (int) * cnt * cnt);
+
+ constexpr int len = sizeof ("/sys/devices/system/node/node12345/"
+ "distance");
+ char filename[len];
+
+ for (int i = 0; i < cnt; i++)
+ {
+ if (len < snprintf (filename, sizeof (filename),
+ "/sys/devices/system/node/node%d/distance", i))
+ return -1;
+ int distance = -1;
+ FILE *in = fopen (filename, "r");
+ for (int j = 0; j < cnt; j++)
+ {
+ fscanf (in, "%d", &distance);
+ if (distance == -1)
+ {
+ fclose (in);
+ free (numa_distances);
+ return -1;
+ }
+ numa_distances[i * cnt + j] = distance;
+ }
+ fclose (in);
+ }
+ num_numa_nodes = cnt;
+ }
+ return numa_distances[node1 * num_numa_nodes + node2];
+}
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);
extern const char *GOMP_OFFLOAD_get_uid (int);
+extern const int GOMP_OFFLOAD_get_numa_node (int);
+
extern unsigned int GOMP_OFFLOAD_get_caps (void);
extern int GOMP_OFFLOAD_get_type (void);
extern int GOMP_OFFLOAD_get_num_devices (unsigned int);
extern bool gomp_display_affinity_var;
extern char *gomp_affinity_format_var;
extern size_t gomp_affinity_format_len;
+extern int gomp_get_current_numa_node ();
+extern int gomp_get_numa_distance (int, int);
extern uintptr_t gomp_def_allocator;
extern const struct gomp_default_icv gomp_default_icv_values;
extern struct gomp_icv_list *gomp_initial_icv_list;
/* Function handlers. */
__typeof (GOMP_OFFLOAD_get_name) *get_name_func;
__typeof (GOMP_OFFLOAD_get_uid) *get_uid_func;
+ __typeof (GOMP_OFFLOAD_get_numa_node) *get_numa_node_func;
__typeof (GOMP_OFFLOAD_get_caps) *get_caps_func;
__typeof (GOMP_OFFLOAD_get_type) *get_type_func;
__typeof (GOMP_OFFLOAD_get_num_devices) *get_num_devices_func;
omp_control_tool_8_;
} OMP_5.2;
+OMP_6.1 {
+ global:
+ omp_get_device_distances;
+} OMP_6.0;
+
GOMP_1.0 {
global:
GOMP_atomic_end;
@c * omp_set_device_num_teams:: <fixme>/TR13
@c * omp_get_device_teams_thread_limit:: <fixme>/TR13
@c * omp_set_device_teams_thread_limit:: <fixme>/TR13
+* omp_get_device_distances:: Obtain the NUMA distance of a device
@end menu
@end table
+@node omp_get_device_distances
+@subsection @code{omp_get_device_distances} -- Obtain the NUMA distance of a device
+@table @asis
+@item @emph{Description}:
+The routine takes two arrays, @var{devs} and @var{distances}, that have at least
+the size @var{ndevs}. For each element of @var{devs}, the corresponding element
+of @var{distances} is set to an implementation-defined non-negative distance
+between the place on which the routine was invoked and the device number
+specified in @var{devs}. For the host device, the distance is zero.
+
+The routine must be executed on the host device and all values of @var{devs}
+must be conforming devices numbers other than @code{omp_invalid_device}.
+
+In GCC the distance is the value returned by the operating system for the
+NUMA distance between the node on which the routine was executed and the node
+to which the device is attached to. There are two exceptions: As required by
+the specification, for the host device @code{0} is returned. Additionally,
+when either the operating system reports a negative number or when GCC runtime
+cannot obtain the distance, the fixed value @code{10} is returned. The former
+is in particular the case when the system is virtualized. The values the
+operating system returns are usually based on the SLIT (System Locality Distance
+Information Table) of ACPI (Advanced Configuration and Power Interface) and have
+values between @code{10} and @code{254}. If an invalid device number is
+specified, GCC returns @code{-1}.
+
+Fortran note: In GCC, this routine does not support passing default-kind integer
+arguments with @code{-fdefault-integer-8}.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void omp_get_device_distances (int ndevs, const int *devs, int *distances)}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{subroutine omp_get_device_distances (ndevs, devs, distances)}
+@item @tab @code{integer, intent(in) :: ndevs, devs(*)}
+@item @tab @code{integer, intent(out) :: distances(*)}
+@end multitable
+
+@item @emph{See also}:
+@ref{omp_get_num_devices}
+
+@item @emph{Reference}:
+@c TODO: Update once OpenMP 6.1 is released
+@uref{https://www.openmp.org, OpenMP specification Technical Report 15}, Section 3.16
+@end table
+
+
@node Device Memory Routines
@section Device Memory Routines
omp_interop_rc_t) __GOMP_NOTHROW;
extern int omp_get_device_from_uid (const char *) __GOMP_NOTHROW;
+extern void omp_get_device_distances(int, const int *, int *) __GOMP_NOTHROW;
extern const char *omp_get_uid_from_device (int) __GOMP_NOTHROW;
extern omp_control_tool_result_t omp_control_tool (omp_control_tool_t, int,
end function omp_get_uid_from_device_8
end interface omp_get_uid_from_device
+ interface
+ ! This function only supports integer(4) for simplicity.
+ ! Contrary to OpenMP 6.1, it also uses bind(C) and value
+ ! and an explicit-size instead of assumed-size array.
+ subroutine omp_get_device_distances (ndevs, devs, distances) bind(C)
+ use iso_c_binding
+ integer(c_int), value, intent(in) :: ndevs
+ integer(c_int), intent(in) :: devs(ndevs)
+ integer(c_int), intent(out) :: distances(ndevs)
+ end subroutine omp_get_device_distances
+ end interface
+
interface omp_control_tool
integer (kind=omp_control_tool_result_kind) function &
omp_control_tool (command, modifier)
end function omp_get_uid_from_device_8
end interface omp_get_uid_from_device
+ interface
+! This function only supports integer(4) for simplicity.
+! Contrary to OpenMP 6.1, it also uses bind(C) and value
+! and an explicit-size instead of assumed-size array.
+ subroutine omp_get_device_distances (ndevs, devs, distances) &
+ & bind(C)
+ use iso_c_binding
+ integer(c_int), value, intent(in) :: ndevs
+ integer(c_int), intent(in) :: devs(ndevs)
+ integer(c_int), intent(out) :: distances(ndevs)
+ end subroutine omp_get_device_distances
+ end interface
+
interface omp_control_tool
integer (kind=omp_control_tool_result_kind) function &
& omp_control_tool (command, modifier)
CUDA_ONE_CALL (cuDeviceGetAttribute)
CUDA_ONE_CALL (cuDeviceGetCount)
CUDA_ONE_CALL (cuDeviceGetName)
+CUDA_ONE_CALL_MAYBE_NULL (cuDeviceGetPCIBusId)
CUDA_ONE_CALL_MAYBE_NULL (cuDeviceGetUuid)
CUDA_ONE_CALL_MAYBE_NULL (cuDeviceGetUuid_v2)
CUDA_ONE_CALL (cuDeviceTotalMem)
thread should have locked agent->module_rwlock for reading before
acquiring it. */
pthread_mutex_t prog_mutex;
+ /* HSA executable - the finalized program that is used to locate kernels. */
+ hsa_executable_t executable;
+ /* NUMA node; 0 = not initialized, < 0 N/A (error or virtualized machine);
+ if > 0: actual node is 'numa_node - 1' (range: 0...(num-nodes -1)). */
+ int numa_node;
/* Flag whether the HSA program that consists of all the modules has been
finalized. */
bool prog_finalized;
- /* HSA executable - the finalized program that is used to locate kernels. */
- hsa_executable_t executable;
};
/* Information required to identify, finalize and run any given kernel. */
return str;
}
+/* Return the NUMA node of the GPU identified by ORD; returns -1 when
+ an error occurred; this value might also be returned if on
+ virtualized systems.
+ The implementation assumes that the Linux /sys is available. */
+
+int
+GOMP_OFFLOAD_get_numa_node (int ord)
+{
+ hsa_status_t status, status2;
+ uint32_t domain, bdfid;
+ struct agent_info *agent = get_agent_info (ord);
+
+ /* Initialized to 0; to distinguish, save with offset. */
+ if (agent->numa_node != 0)
+ return agent->numa_node > 0 ? agent->numa_node - 1 : agent->numa_node;
+
+ agent->numa_node = -1;
+
+ status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AMD_AGENT_INFO_DOMAIN,
+ &domain);
+ status2 = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AMD_AGENT_INFO_BDFID,
+ &bdfid);
+ if (status != HSA_STATUS_SUCCESS || status2 != HSA_STATUS_SUCCESS)
+ return -1;
+
+ constexpr int len = sizeof ("/sys/bus/pci/devices/0000:00:00.00/numa_node");
+ char filename[len];
+ if (len < snprintf (filename, sizeof (filename),
+ "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node",
+ domain, bdfid >> 8, bdfid & 0xFF))
+ return -1;
+
+ FILE *in = fopen (filename, "r");
+ if (!in)
+ return -1;
+ int numa_node = -1;
+ fscanf (in, "%d", &numa_node);
+ fclose (in);
+
+ agent->numa_node = numa_node >= 0 ? numa_node + 1 : numa_node;
+ return numa_node;
+}
+
/* Return the specific capabilities the HSA accelerator have. */
unsigned int
int warp_size;
int max_threads_per_block;
int max_threads_per_multiprocessor;
+ int numa_node;
int default_dims[GOMP_DIM_MAX];
/* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
return str;
}
+/* Return the NUMA node of the GPU identified by ORD; returns -1 when
+ an error occurred; this value might also be returned if on
+ virtualized systems.
+ The implementation assumes that the Linux /sys is available. */
+
+int
+GOMP_OFFLOAD_get_numa_node (int ord)
+{
+ CUresult r = CUDA_ERROR_NOT_FOUND;
+ char bus_id[14] = {};
+ struct ptx_device *dev = ptx_devices[ord];
+
+ /* Initialized to 0; to distinguish, save with offset. */
+ if (dev->numa_node != 0)
+ return dev->numa_node > 0 ? dev->numa_node - 1 : dev->numa_node;
+
+ dev->numa_node = -1;
+
+ if (CUDA_CALL_EXISTS (cuDeviceGetPCIBusId))
+ r = CUDA_CALL_NOCHECK (cuDeviceGetPCIBusId, bus_id, sizeof (bus_id)-1,
+ dev->dev);
+ if (bus_id[0] == '\0' || r != CUDA_SUCCESS)
+ return -1;
+
+ constexpr int len = (sizeof("/sys/bus/pci/devices//numa_node")
+ + sizeof (bus_id));
+ char filename[len];
+ if (len < snprintf (filename, sizeof (filename),
+ "/sys/bus/pci/devices/%s/numa_node", bus_id))
+ return -1;
+
+ FILE *in = fopen (filename, "r");
+ if (!in)
+ return -1;
+ int numa_node = -1;
+ fscanf (in, "%d", &numa_node);
+ fclose (in);
+
+ dev->numa_node = numa_node >= 0 ? numa_node + 1 : numa_node;
+ return numa_node;
+}
+
unsigned int
GOMP_OFFLOAD_get_caps (void)
{
return omp_invalid_device;
}
+/* Return the numa distance between the numa node of the calling host
+ thread and each of the NDEV devices in DEVICES.
+ Special values:
+ - Invalid device number: undefined, GCC uses -1.
+ - Host device: OpenMP defines this to be 0
+ - Nonhost device: >= 0. If not available use 10 (= lowest ACPI
+ distance).
+ Note: Not available can mean either not a supported system
+ (e.g non Linux) or the value is not known (virtualized system). */
+
+void
+omp_get_device_distances (int ndevs, const int *devs, int *distances)
+{
+ if (ndevs < 1)
+ return;
+ int num_devices = gomp_get_num_devices ();
+ int node = gomp_get_current_numa_node ();
+ if (node < 0) /* Not supported. */
+ {
+ for (int i = 0; i < ndevs; i++)
+ if (devs[i] < omp_initial_device || devs[i] > num_devices)
+ distances[i] = -1; /* invalid */
+ else if (devs[i] == omp_initial_device || devs[i] == num_devices)
+ distances[i] = 0;
+ else
+ distances[i] = 10;
+ return;
+ }
+ for (int i = 0; i < ndevs; i++)
+ {
+ int device_num = (devs[i] == omp_default_device
+ ? gomp_get_default_device () : devs[i]);
+ if (device_num < omp_initial_device || device_num > num_devices)
+ distances[i] = -1; /* invalid */
+ else if (device_num == omp_initial_device || device_num == num_devices)
+ distances[i] = 0;
+ else
+ {
+ int dist = 10;
+ struct gomp_device_descr *devicep = resolve_device (device_num,
+ false);
+ if (devicep && devicep->get_numa_node_func)
+ {
+ int node2 = devicep->get_numa_node_func (devicep->target_id);
+ dist = gomp_get_numa_distance (node, node2);
+ }
+ distances[i] = dist >= 0 ? dist : 10;
+ }
+ }
+}
+
ialias (omp_get_uid_from_device)
ialias (omp_get_device_from_uid)
+ialias (omp_get_device_distances)
#ifdef PLUGIN_SUPPORT
DLSYM (get_name);
DLSYM_OPT (get_uid, get_uid);
+ DLSYM_OPT (get_numa_node, get_numa_node);
DLSYM (get_caps);
DLSYM (get_type);
DLSYM (get_num_devices);
--- /dev/null
+#include <omp.h>
+
+int
+main ()
+{
+ int dev, dist;
+ int dev2, dist2;
+
+ // GCC specific: invalid numbers return -1
+ dev = omp_invalid_device; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != omp_invalid_device || dist != -1)
+ __builtin_abort ();
+
+ dev = -99; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != -99 || dist != -1)
+ __builtin_abort ();
+
+ dev = omp_get_num_devices () + 1; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != omp_get_num_devices () + 1 || dist != -1)
+ __builtin_abort ();
+
+ // The following two need to yield idential results
+ dev = omp_default_device; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != omp_default_device || dist < 0)
+ __builtin_abort ();
+
+ dev2 = omp_get_default_device (); dist2 = -99;
+ omp_get_device_distances (1, &dev2, &dist2);
+ if (dev2 != omp_get_default_device () || dist2 != dist)
+ __builtin_abort ();
+
+ // The following two need to yield idential results
+ dev = omp_initial_device; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != omp_initial_device || dist != 0)
+ __builtin_abort ();
+
+ dev2 = omp_get_num_devices (); dist2 = -99;
+ omp_get_device_distances (1, &dev2, &dist2);
+ if (dev2 != omp_get_num_devices () || dist2 != 0)
+ __builtin_abort ();
+
+ for (int i = omp_initial_device; i <= omp_get_num_devices (); i++)
+ {
+ dev = i; dist = -99;
+ omp_get_device_distances (1, &dev, &dist);
+ if (dev != i)
+ __builtin_abort ();
+ // Host == 0 per definition
+ // GCC specific: All others (unknown or not) should have > 0
+ // the spec only requires >= 0
+ if (i == omp_initial_device || i == omp_get_num_devices ())
+ {
+ if (dist != 0)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (dist <= 0)
+ __builtin_abort ();
+ }
+ }
+
+ int *devs, *dists;
+ int size = 1 + 1 + omp_get_num_devices ();
+ devs = (int *) __builtin_malloc (sizeof (int) * size * size);
+ dists = (int *) __builtin_malloc (sizeof (int) * size * size);
+ for (int i = 0; i < size; i++)
+ {
+ int dev = i - 1; // -1 (initial), 0, ... num_dev
+ devs[i] = dev;
+ dists[i] = -99;
+ }
+
+ omp_get_device_distances (size, devs, dists);
+
+ for (int i = 0; i < size; i++)
+ {
+ int dev = i - 1; // -1 (initial), 0, ... num_dev
+ if (devs[i] != dev)
+ __builtin_abort ();
+ // Host == 0 per definition
+ // GCC specific: All others (unknown or not) should have > 0
+ // the spec only requires >= 0
+ if (devs[i] == omp_initial_device || devs[i] == omp_get_num_devices ())
+ {
+ if (dists[i] != 0)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (dists[i] <= 0)
+ __builtin_abort ();
+ }
+ __builtin_printf ("Device%3d: distance %d\n", dev, dists[i]);
+ }
+}
--- /dev/null
+use omp_lib
+implicit none (type, external)
+integer :: devs(1), dists(1)
+integer, allocatable :: a_devs(:), a_dists(:)
+integer :: i
+
+! GCC specific: invalid numbers return -1
+devs(1) = omp_invalid_device; dists(1) = -99
+call omp_get_device_distances (size(devs), devs, dists)
+if (devs(1) /= omp_invalid_device .or. dists(1) /= -1) stop 1
+
+devs(1) = -123; dists(1) = -99
+call omp_get_device_distances (size(devs), devs, dists)
+if (devs(1) /= -123 .or. dists(1) /= -1) stop 2
+
+devs(1) = omp_get_num_devices () + 1; dists(1) = -99
+call omp_get_device_distances (size(devs), devs, dists)
+if (devs(1) /= omp_get_num_devices () + 1 .or. dists(1) /= -1) stop 3
+
+do i = omp_initial_device, omp_get_num_devices ()
+ devs(1) = i; dists(1) = -99
+ call omp_get_device_distances (size(devs), devs, dists)
+ if (devs(1) /= i) stop 4
+ ! Host == 0 per definition
+ ! GCC specific: All others (unknown or not) should have > 0
+ ! the spec only requires >= 0
+ if (i == omp_initial_device .or. i == omp_get_num_devices ()) then
+ if (dists(1) /= 0) stop 5
+ else
+ if (dists(1) <= 0) stop 6
+ end if
+end do
+
+allocate(a_devs(omp_initial_device:omp_get_num_devices ()))
+a_devs = [(i, i = omp_initial_device, omp_get_num_devices ())]
+allocate(a_dists, mold=a_devs)
+a_dists = -99
+
+call omp_get_device_distances (size(a_devs), a_devs, a_dists)
+
+do i = lbound (a_devs, 1), ubound (a_devs, 1)
+ if (a_devs(i) /= i) stop 7
+ ! Host == 0 per definition
+ ! GCC specific: All others (unknown or not) should have > 0
+ if (i == omp_initial_device .or. i == omp_get_num_devices ()) then
+ if (a_dists(i) /= 0) stop 8
+ else
+ if (a_dists(i) <= 0) stop 9
+ end if
+ print '(a,i3,a,i0)', 'device', i, ': distance ', a_dists(i)
+end do
+
+end