////////////////////////////////////////////////////////////////////////////////
//
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
* String containing the ROCr build identifier.
*/
HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
-
+ /**
+ * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of
+ * this attribute is bool.
+ */
+ HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+ // TODO: Should this be per Agent?
+ /**
+ * Returns true if all Agents have access to system allocated memory (such as
+ * that allocated by mmap, malloc, or new) by default.
+ * If false then system allocated memory may only be made SVM accessible to
+ * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes.
+ * The type of this attribute is bool.
+ */
HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
} hsa_system_info_t;
* Minor version of the HSA runtime specification supported by the
* agent. The type of this attribute is uint16_t.
*/
- HSA_AGENT_INFO_VERSION_MINOR = 22
-
+ HSA_AGENT_INFO_VERSION_MINOR = 22,
+ /**
+ * This enum does not have a fixed underlying type, thus in C++ post D2338:
+ * If the enumeration type does not have a fixed underlying type, the value is
+ * unchanged if the original value is within the range of the enumeration
+ * values (9.7.1 [dcl.enum]), and otherwise, the behavior is
+ * undefined.
+ * Thus increase the range of this enum to encompass vendor extensions.
+ */
+ HSA_AGENT_INFO_LAST = INT32_MAX
} hsa_agent_info_t;
/**
////////////////////////////////////////////////////////////////////////////////
//
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
extern "C" {
#endif
+/** \addtogroup aql Architected Queuing Language
+ * @{
+ */
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants.
+ */
+typedef uint32_t hsa_signal_condition32_t;
+
+/**
+ * @brief AMD vendor specific packet type.
+ */
+typedef enum {
+ /**
+ * Packet used by agents to delay processing of subsequent packets until a
+ * configurable condition is satisfied by an HSA signal. Only kernel dispatch
+ * queues created from AMD GPU Agents support this packet.
+ */
+ HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+} hsa_amd_packet_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants.
+ */
+typedef uint8_t hsa_amd_packet_type8_t;
+
+/**
+ * @brief AMD vendor specific AQL packet header
+ */
+typedef struct hsa_amd_packet_header_s {
+ /**
+ * Packet header. Used to configure multiple packet parameters such as the
+ * packet type. The parameters are described by ::hsa_packet_header_t.
+ */
+ uint16_t header;
+
+ /**
+ *Format of the vendor specific packet.
+ */
+ hsa_amd_packet_type8_t AmdFormat;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint8_t reserved;
+} hsa_amd_vendor_packet_header_t;
+
+/**
+ * @brief AMD barrier value packet. Halts packet processing and waits for
+ * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value
+ * is the value of the signal ::signal.
+ */
+typedef struct hsa_amd_barrier_value_packet_s {
+ /**
+ * AMD vendor specific packet header.
+ */
+ hsa_amd_vendor_packet_header_t header;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved0;
+
+ /**
+ * Dependent signal object. A signal with a handle value of 0 is
+ * allowed and is interpreted by the packet processor a satisfied
+ * dependency.
+ */
+ hsa_signal_t signal;
+
+ /**
+ * Value to compare against.
+ */
+ hsa_signal_value_t value;
+
+ /**
+ * Bit mask to be combined by bitwise AND with ::signal's value.
+ */
+ hsa_signal_value_t mask;
+
+ /**
+ * Comparison operation. See ::hsa_signal_condition_t.
+ */
+ hsa_signal_condition32_t cond;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved2;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved3;
+
+ /**
+ * Signal used to indicate completion of the job. The application can use the
+ * special signal handle 0 to indicate that no signal is used.
+ */
+ hsa_signal_t completion_signal;
+} hsa_amd_barrier_value_packet_t;
+
+/** @} */
+
/**
* @brief Enumeration constants added to ::hsa_status_t.
*
* Agent executed an invalid shader instruction.
*/
HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+
+ /**
+ * Agent attempted to access an inaccessible address.
+ * See hsa_amd_register_system_event_handler and
+ * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
+ */
+ HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+ /**
+ * The CU mask was successfully set but the mask attempted to enable a CU
+ * which was disabled for the process. CUs disabled for the process remain
+ * disabled.
+ */
+ HSA_STATUS_CU_MASK_REDUCED = 44,
};
/**
const hsa_agent_t* consumers, uint64_t attributes,
hsa_signal_t* signal);
+/**
+ * @brief Returns a pointer to the value of a signal.
+ *
+ * Use of this API does not modify the lifetime of ::signal and any
+ * hsa_signal_value_t retrieved by this API has lifetime equal to that of
+ * ::signal.
+ *
+ * This API is intended for partial interoperability with non-HSA compatible
+ * devices and should not be used where HSA interfaces are available.
+ *
+ * Use of the signal value must comply with use restritions of ::signal.
+ * Use may result in data races if the operations performed are not platform
+ * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC
+ * attributed signals is required.
+ *
+ * @param[in] Signal handle to extract the signal value pointer from.
+ *
+ * @param[out] Location where the extracted signal value pointer will be placed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL.
+ */
+hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
+ volatile hsa_signal_value_t** value_ptr);
+
/**
* @brief Asyncronous signal handler function type.
*
void* value);
/**
- * @brief Set a CU affinity to specific queues within the process, this function
- * call is "atomic".
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs. The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
+ * If num_cu_mask_count is 0 then the request is interpreted as a request to
+ * enable all CUs and no cu_mask array need be given.
*
* @param[in] queue A pointer to HSA queue.
*
- * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
*
* @param[in] cu_mask Bit-vector representing the CU mask.
*
* @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
*
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled.
+ *
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
* initialized.
*
* @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
*
* @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
- * multiple of 32 or @p cu_mask is NULL.
- *
- * @retval ::HSA_STATUS_ERROR failed to call thunk api
+ * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL.
+ * Devices with work group processors must even-index contiguous pairwise
+ * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110)
+ * are invalid.
*
*/
hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
uint32_t num_cu_mask_count,
const uint32_t* cu_mask);
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+ uint32_t* cu_mask);
+
/**
* @brief Memory segments associated with a memory pool.
*/
HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
+/**
+ * @brief Memory pool flag used to specify allocation directives
+ *
+ */
+typedef enum hsa_amd_memory_pool_flag_s {
+ /**
+ * Allocates memory that conforms to standard HSA memory consistency model
+ */
+ HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0,
+ /**
+ * Allocates fine grain memory type where memory ordering is per point to point
+ * connection. Atomic memory operations on these memory buffers are not
+ * guaranteed to be visible at system scope.
+ */
+ HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1,
+
+} hsa_amd_memory_pool_flag_t;
+
/**
* @brief Get the current value of an attribute of a memory pool.
*
* ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
*
* @param[in] flags A bit-field that is used to specify allocation
- * directives. Reserved parameter, must be 0.
+ * directives.
*
* @param[out] ptr Pointer to the location where to store the base virtual
* address of
*
* @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly
* access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
*
* @param[in] src A valid pointer to the source of data to be copied. The source
* buffer must not overlap with the destination buffer, otherwise the copy will succeed
*
* @param[in] src_agent Agent associated with the @p src. The agent must be able to directly
* access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
*
* @param[in] size Number of bytes to copy. If @p size is 0, no copy is
* performed and the function returns success. Copying a number of bytes larger
*
* @param[in] dep_signals List of signals that must be waited on before the copy
* operation starts. The copy will start after every signal has been observed with
- * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy
- * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this
- * argument is ignored.
+ * the value 0. The dependent signal should not include completion signal from
+ * hsa_amd_memory_async_copy operation to be issued in future as that can result
+ * in a deadlock. If @p num_dep_signals is 0, this argument is ignored.
*
* @param[in] completion_signal Signal used to indicate completion of the copy
* operation. When the copy operation is finished, the value of the signal is
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
* initialized.
*
- * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access.
*
* @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
*
GPU boards) any such agent may be returned.
*/
hsa_agent_t agentOwner;
+ /*
+ Contains a bitfield of hsa_amd_memory_pool_global_flag_t values.
+ Reports the effective global flags bitmask for the allocation. This field is not meaningful if
+ the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+ */
+ uint32_t global_flags;
} hsa_amd_pointer_info_t;
/**
*
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
*/
-hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr,
hsa_amd_pointer_info_t* info,
void* (*alloc)(size_t),
uint32_t* num_agents_accessible,
*
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
*/
-hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr,
void* userdata);
/**
// GPU attempted access to a host only page.
HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
// DRAM ECC failure.
- HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4,
+ HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
// Can't determine the exact fault address.
HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
// SRAM ECC failure (ie registers, no fault address).
- HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6,
+ HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
// GPU reset following unspecified hang.
HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
} hsa_amd_memory_fault_reason_t;
hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
hsa_amd_deallocation_callback_t callback);
+typedef enum hsa_amd_svm_model_s {
+ /**
+ * Updates to memory with this attribute conform to HSA memory consistency
+ * model.
+ */
+ HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+ /**
+ * Writes to memory with this attribute can be performed by a single agent
+ * at a time.
+ */
+ HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1,
+ /**
+ * Memory region queried contains subregions with both
+ * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and
+ * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes.
+ *
+ * This attribute can not be used in hsa_amd_svm_attributes_set. It is a
+ * possible return from hsa_amd_svm_attributes_get indicating that the query
+ * region contains both coarse and fine grained memory.
+ */
+ HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2
+} hsa_amd_svm_model_t;
+
+typedef enum hsa_amd_svm_attribute_s {
+ // Memory model attribute.
+ // Type of this attribute is hsa_amd_svm_model_t.
+ HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+ // Marks the range read only. This allows multiple physical copies to be
+ // placed local to each accessing device.
+ // Type of this attribute is bool.
+ HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+ // Automatic migrations should attempt to keep the memory within the xgmi hive
+ // containing accessible agents.
+ // Type of this attribute is bool.
+ HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+ // Page granularity to migrate at once. Page granularity is specified as
+ // log2(page_count).
+ // Type of this attribute is uint64_t.
+ HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+ // Physical location to prefer when automatic migration occurs.
+ // Set to the null agent handle (handle == 0) to indicate there
+ // is no preferred location.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+ // This attribute can not be used in ::hsa_amd_svm_attributes_set (see
+ // ::hsa_amd_svm_prefetch_async).
+ // Queries the physical location of most recent prefetch command.
+ // If the prefetch location has not been set or is not uniform across the
+ // address range then returned hsa_agent_t::handle will be 0.
+ // Querying this attribute will return the destination agent of the most
+ // recent ::hsa_amd_svm_prefetch_async targeting the address range. If
+ // multiple async prefetches have been issued targeting the region and the
+ // most recently issued prefetch has completed then the query will return
+ // the location of the most recently completed prefetch.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+ // Optimizes with the anticipation that the majority of operations to the
+ // range will be read operations.
+ // Type of this attribute is bool.
+ HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6,
+ // Allows the execution on GPU.
+ // Type of this attribute is bool.
+ HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7,
+ // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+ // Enables an agent for access to the range. Access may incur a page fault
+ // and associated memory migration. Either this or
+ // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+ // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+ // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+ // Enables an agent for access to the range without page faults. Access
+ // will not incur a page fault and will not cause access based migration.
+ // and associated memory migration. Either this or
+ // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+ // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+ // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+ // Denies an agent access to the memory range. Access will cause a terminal
+ // segfault.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+ // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+ // Returns the access attribute associated with the agent.
+ // The agent to query must be set in the attribute value field.
+ // The attribute enum will be replaced with the agent's current access
+ // attribute for the address range.
+ // TODO: Clarify KFD return value for non-uniform access attribute.
+ // Type of this attribute is hsa_agent_t.
+ HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// List type for hsa_amd_svm_attributes_set/get.
+typedef struct hsa_amd_svm_attribute_pair_s {
+ // hsa_amd_svm_attribute_t value.
+ uint64_t attribute;
+ // Attribute value. Bit values should be interpreted according to the type
+ // given in the associated attribute description.
+ uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+ hsa_amd_svm_attribute_pair_t* attribute_list,
+ size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+ hsa_amd_svm_attribute_pair_t* attribute_list,
+ size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+ uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+ hsa_signal_t completion_signal);
+
#ifdef __cplusplus
} // end extern "C" block
#endif
////////////////////////////////////////////////////////////////////////////////
//
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
@PLUGIN_GCN_TRUE@ $(am__DEPENDENCIES_1)
@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS = \
-@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo
+@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo \
+@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-simple-allocator.lo \
+@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-mutex.lo
libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
# AMD GCN plugin
@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
-@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \
+@PLUGIN_GCN_TRUE@ plugin/mutex.c
+
@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
@PLUGIN_GCN_TRUE@ -D_GNU_SOURCE
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+libgomp_plugin_gcn_la-simple-allocator.lo: simple-allocator.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-simple-allocator.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='simple-allocator.c' object='libgomp_plugin_gcn_la-simple-allocator.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c
+
+libgomp_plugin_gcn_la-mutex.lo: plugin/mutex.c
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-mutex.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='plugin/mutex.c' object='libgomp_plugin_gcn_la-mutex.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c
+
libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin/plugin-nvptx.c
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin/plugin-nvptx.c' || echo '$(srcdir)/'`plugin/plugin-nvptx.c
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo
omp_interop_property_t);
#endif
+/* simple-allocator.c */
+
+typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p;
+
+gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context ();
+void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx,
+ char *base, size_t size);
+void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size);
+void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr);
+void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr,
+ size_t newsize);
+
#ifdef __cplusplus
}
#endif
}
#endif
-/* simple-allocator.c */
-
-typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p;
-
-gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context ();
-void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx,
- char *base, size_t size);
-void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size);
-void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr);
-void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr,
- size_t newsize);
+/* simple-allocator.c has its prototypes in libgomp-plugin.h so it's
+ accessible from both. */
#endif /* LIBGOMP_H */
a performance boost for NVPTX offload code and also allows unlimited use
of pinned memory regardless of the OS @code{ulimit}/@code{rlimit}
settings.
-@item Managed memory allocated with the OpenMP
+@item Managed memory allocated on the host with the
@code{ompx_gnu_managed_mem_alloc} allocator or in the
- @code{ompx_gnu_managed_mem_space} is not currently supported for AMD GPU
- devices; attempting to use it in an allocator will trigger the fall-back
- trait.
+ @code{ompx_gnu_managed_mem_space} (both GNU extensions) allocate memory
+ equivalent to HIP Managed Memory, although @emph{not} actually allocated
+ using @code{hipMallocManaged}. This memory is accessible by both the
+ host and the device at the same address, so it need not be mapped with
+ @code{map} clauses. Instead, use the @code{is_device_ptr} clause or
+ @code{has_device_addr} clause to indicate that the pointer is already
+ accessible on the device. The ROCm runtime will automatically handle
+ data migration between host and device as needed. Not all AMD GPU
+ devices support this feature, and many that do require that
+ @code{-mxnack=on} is configured at compile time. If managed memory is
+ not supported by the default device, as configured at the moment the
+ allocator is called, then the allocator will use the fall-back setting.
+ If the default device is configured differently when the memory is freed,
+ via @code{omp_free} or @code{omp_realloc}, the result may be undefined.
+ If the current device does not support Unified Shared Memory (or it is
+ not enabled with @code{HSA_XNACK=1}) then Managed Memory might still
+ work, but allocations may only be visible to a single device (whichever
+ was the default device when the @emph{first} allocation was made).
@item The OpenMP routines @code{omp_target_memcpy_rect} and
@code{omp_target_memcpy_rect_async} and the @code{target update}
directive for non-contiguous list items use the 3D memory-copy function
# AMD GCN plugin
libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
-libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \
+ plugin/mutex.c
libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
-D_GNU_SOURCE
libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
--- /dev/null
+/* Mutex implementation for libgomp plugins.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ Contributed by BayLibre
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a minimal implementation of the gomp_mutex_t spinlocks, but
+ without all the dependencies used by the config/linux/mutex implementation.
+
+ At the time of writing, this is only used by simple_alloc which has
+ short-lived locks and should be fine with these. The actual locks are in
+ a header file, so only the fallback "slow" functions are needed here. */
+
+#include "config.h"
+#include <unistd.h>
+#include "libgomp.h"
+
+#ifndef HAVE_SYNC_BUILTINS
+#error "HAVE_SYNC_BUILTINS is required to build this"
+#endif
+
+void
+gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval)
+{
+ while (oldval == 1)
+ {
+ usleep (1);
+ oldval = __atomic_exchange_n (mutex, 1, __ATOMIC_ACQUIRE);
+ }
+}
+
+void
+gomp_mutex_unlock_slow (gomp_mutex_t *mutex)
+{
+ GOMP_PLUGIN_fatal ("gomp_mutex_unlock_slow should be unreachable");
+}
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
/* Create hash-table for declare target's indirect clause on the host;
see build-target-indirect-htab.h for details. */
const hsa_dim3_t *range, hsa_agent_t copy_agent,
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+ hsa_status_t (*hsa_amd_svm_attributes_set_fn)
+ (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+ size_t attribute_count);
};
/* As an HIP runtime is dlopened, following structure defines function
}
else
GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+
+ bool svm_supported;
+ status = hsa_fns.hsa_system_get_info_fn
+ (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported);
+ if (status == HSA_STATUS_SUCCESS)
+ GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n",
+ (svm_supported ? "TRUE" : "FALSE"));
+ else
+ GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n");
+
+ bool svm_accessible;
+ status = hsa_fns.hsa_system_get_info_fn
+ (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+ if (status == HSA_STATUS_SUCCESS)
+ GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n",
+ (svm_accessible ? "TRUE" : "FALSE"));
+ else
+ GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n");
}
/* Dump information about the available hardware. */
DLSYM_OPT_FN (hsa_amd_memory_lock)
DLSYM_OPT_FN (hsa_amd_memory_unlock)
DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
+ DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
"Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
"devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
device_isa_s, agent_isa_s, agent->device_id);
+ else if (strcmp (device_isa_s, agent_isa_s) == 0)
+ snprintf (msg, sizeof msg,
+ "GCN code object features do not match for an unknown reason "
+ "(device %d).\n"
+ "Try to adjust the HSA_XNACK setting (perhaps?), or use\n"
+ "ROCR_VISIBLE_DEVICES to disable incompatible devices.\n",
+ agent->device_id);
else
snprintf (msg, sizeof msg,
"GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
}
/* }}} */
+/* {{{ Managed Memory
+
+ This implements an allocator equivalent to CUDA "Managed" memory, in which
+ the pages automatically migrate between host and device memory, as needed.
+ These allocations are visible from both the host and devices without the
+ need for explicit mappings. However, OpenMP does need "is_device_ptr" or
+ "has_device_addr" to function properly.
+
+ There isn't a high-level HSA/ROCr API to allocate managed memory, so we
+ use regular memory and register it with the driver by setting it to
+ "coarse-grained" mode, and setting the "accessible by default" attribute
+ on devices where HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT isn't set
+ as standard (as it isn't on systems that don't support USM, or when
+ HSA_XNACK != 1).
+
+ This is in contrast to GOMP_OFFLOAD_alloc which allocates coarse-grained
+ *GPU memory*, which is not visible on the host.
+
+ It would be possible to register memory returned by malloc, but
+ experimentation shows that doing so causes memory faults within the HSA
+ runtime code. Therefore, the Managed memory space is allocated as a
+ largish block and then subdivided via a custom allocator. The "simple"
+ allocator is designed specifically to store its free-chain outside of
+ the registered pages so that allocation does not inadvertently cause
+ pages to migrate.
+
+ Note: if the user has multiple mismatched devices, and one or more do
+ not support USM (or XNACK is off), then each page of the Managed heap
+ could end up associated with a different device (by calling omp_alloc
+ before and after omp_set_default_device). This issue remains
+ an *unhandled* edge-case, at present. */
+
+gomp_simple_alloc_ctx_p managed_ctx = NULL;
+
+/* Initialize or extend the Managed memory space. This is called whenever
+ allocation fails. SIZE is the minimum size required for the failed
+ allocation to succeed; the function may choose a larger size.
+ Note that Linux lazy allocation means that the memory returned isn't
+ guaranteed to actually exist. */
+
+static bool
+managed_heap_create (struct agent_info *agent, size_t size)
+{
+ static int lock = 0;
+ while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE) != 0)
+ ;
+
+ size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */
+ if (size < default_size)
+ size = default_size;
+
+ /* Round up to a whole page. */
+ int pagesize = getpagesize ();
+ int misalignment = size % pagesize;
+ if (misalignment > 0)
+ size += pagesize - misalignment;
+
+ /* Try to get contiguous memory, but it might not be possible.
+ The most recent previous allocation is at the head of the list. */
+ static void *addrhint = NULL;
+ void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (!new_pages)
+ {
+ GCN_DEBUG ("Could not allocate Managed Memory heap.");
+ __atomic_store_n (&lock, 0, __ATOMIC_RELEASE);
+ return false;
+ }
+
+ /* Register the heap allocation as coarse grained, "Managed" memory. */
+ struct hsa_amd_svm_attribute_pair_s attr = {
+ HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+ HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+ };
+ hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+ &attr, 1);
+ if (status != HSA_STATUS_SUCCESS)
+ GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+ " please update your drivers and/or kernel");
+
+ /* The HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE setting is required on devices
+ without default SVM. */
+ static int svm_accessible = 0xff; /* Use 0xff as "undefined". */
+ if (svm_accessible == 0xff)
+ {
+ status = hsa_fns.hsa_system_get_info_fn
+ (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+ if (status != HSA_STATUS_SUCCESS)
+ {
+ GCN_DEBUG ("warning: failed to query "
+ " HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT\n");
+ svm_accessible = false;
+ }
+ }
+ if (svm_accessible == false)
+ {
+ struct hsa_amd_svm_attribute_pair_s attr2;
+ attr2.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
+ attr2.value = agent->id.handle;
+ status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size, &attr2,
+ 1);
+ if (status != HSA_STATUS_SUCCESS)
+ GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+ " please update your drivers and/or kernel");
+ }
+
+ addrhint = new_pages + size;
+
+ /* Initialize a new Managed memory heap, or add the new memory into an
+ existing Managed memory heap. */
+ if (!managed_ctx)
+ managed_ctx = gomp_simple_alloc_init_context ();
+ gomp_simple_alloc_register_memory (managed_ctx, new_pages, size);
+
+ __atomic_store_n (&lock, 0, __ATOMIC_RELEASE);
+ return true;
+}
+
+/* }}} */
/* {{{ OpenACC support */
/* Execute an OpenACC kernel, synchronously or asynchronously. */
GOMP_PLUGIN_target_task_completion, async_data);
}
+/* Allocate memory suitable for Managed Memory. */
+
+void *
+GOMP_OFFLOAD_managed_alloc (int device, size_t size)
+{
+ struct agent_info *agent = get_agent_info (device);
+ while (1)
+ {
+ void *result = gomp_simple_alloc (managed_ctx, size);
+ if (result)
+ return result;
+
+ /* Allocation failed. Try again if we can create a new heap block.
+ Note: it's possible another thread could get to the new memory
+ first, so the while loop is necessary. */
+ if (!managed_heap_create (agent, size))
+ return NULL;
+ }
+}
+
+/* Free memory allocated via GOMP_OFFLOAD_managed_alloc. */
+
+bool
+GOMP_OFFLOAD_managed_free (int device, void *ptr)
+{
+ gomp_simple_free (managed_ctx, ptr);
+ return true;
+}
+
/* }}} */
/* {{{ OpenACC Plugin API */
/* Include the splay tree code inline, with the prefixes added. */
#define splay_tree_prefix simple_alloc
#define splay_tree_c
+#define gomp_fatal GOMP_PLUGIN_fatal /* So it links into a plugin. */
#include "splay-tree.h"
if { [check_effective_target_offload_device_nvptx] } {
return 1
}
+
+ if { [libgomp_check_effective_target_offload_target "amdgcn"] } {
+ if [check_runtime_nocache managed_available_ {
+ #include <omp.h>
+ #include <stdlib.h>
+ int main ()
+ {
+ const omp_alloctrait_t traits[] = {
+ { omp_atk_fallback, omp_atv_null_fb }
+ };
+ omp_allocator_handle_t managed_no_fallback
+ = omp_init_allocator (ompx_gnu_managed_mem_space, 1, traits);
+ void *a = omp_alloc (16, managed_no_fallback);
+ return a == NULL;
+ }
+ } ] {
+ return 1
+ }
+ }
+
+ return 0
+}
+
+# return 1 if -mxnack=on is accepted
+
+proc check_effective_target_offload_target_amdgcn_with_xnack { } {
+ if { [libgomp_check_effective_target_offload_target "amdgcn"] } {
+ return [check_no_compiler_messages amd_xnack_ executable {
+ int main () {
+ #pragma omp target
+ ;
+ return 0;
+ }
+ } "-foffload-options=amdgcn-amdhsa=-mxnack=on" ]
+ }
+
return 0
}
// { dg-do run }
// { dg-require-effective-target omp_managedmem }
+// { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } }
// Check that the ompx::allocator::gnu_managed_mem allocator can allocate
// Managed Memory, and that host and target can see the data, at the same
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
/* { dg-additional-sources requires-4-aux.c } */
+/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware
+ support is limited, which results in a diagnostic. */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
+
/* Check no diagnostic by device-compiler's or host compiler's lto1.
Other file uses: 'requires reverse_offload', but that's inactive as
there are no declare target directives, device constructs nor device routines */
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
/* { dg-additional-sources requires-4-aux.c } */
+/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware
+ support is limited, which results in a diagnostic. */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
+
/* Same as requires-4.c, but uses heap memory for 'a'. */
/* Check no diagnostic by device-compiler's or host compiler's lto1.
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
/* { dg-additional-sources requires-5-aux.c } */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
/* Depending on offload device capabilities, it may print something like the
following (only) if GOMP_DEBUG=1:
/* { dg-do run } */
/* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn_with_xnack } } */
/* Check that omp_alloc can allocate Managed Memory, and that host and target
can see the data, at the same address, without a mapping. */
/* { dg-do run } */
/* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
/* Check that omp_calloc can allocate Managed Memory, and that host and target
can see the data, at the same address, without a mapping. */
/* { dg-do run } */
/* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
/* Check that omp_realloc can allocate Managed Memory, and that host and target
can see the data, at the same address, without a mapping. */
/* { dg-do run } */
/* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
/* { dg-shouldfail "" } */
/* { dg-output "libgomp: attempted to free managed memory at 0x\[0-9a-f\]+, but the default device is set to the host device" } */
! { dg-do run }
! { dg-require-effective-target omp_managedmem }
+! { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } }
! Check that omp_alloc can allocate Managed Memory, and that host and target
! can see the data, at the same address, without a mapping.