libgomp, amdgcn: Implement Managed Memory

author Andrew Stubbs <ams@baylibre.com>

Thu, 20 Nov 2025 17:18:13 +0000 (17:18 +0000)

committer Andrew Stubbs <ams@baylibre.com>

Mon, 1 Dec 2025 12:03:35 +0000 (12:03 +0000)
author Andrew Stubbs <ams@baylibre.com>
Thu, 20 Nov 2025 17:18:13 +0000 (17:18 +0000)
committer Andrew Stubbs <ams@baylibre.com>
Mon, 1 Dec 2025 12:03:35 +0000 (12:03 +0000)
diff --git a/include/hsa.h b/include/hsa.h

index 3c7be95d7fd0fb6cbb48ab5c0a078c4eb7f6bbf4..28867a91a7cd5674b2460befe523301ca2d6b2eb 100644 (file)
--- a/include/hsa.h
+++ b/include/hsa.h
@@ -1,6 +1,6 @@
  ////////////////////////////////////////////////////////////////////////////////
  //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
  //
  // Permission is hereby granted, free of charge, to any person or organization
  // obtaining a copy of the software and accompanying documentation covered by
@@ -467,7 +467,19 @@ typedef enum {
    * String containing the ROCr build identifier.
    */
    HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200,
-
+  /**
+  * Returns true if hsa_amd_svm_* APIs are supported by the driver.  The type of
+  * this attribute is bool.
+  */
+  HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201,
+  // TODO: Should this be per Agent?
+  /**
+  * Returns true if all Agents have access to system allocated memory (such as
+  * that allocated by mmap, malloc, or new) by default.
+  * If false then system allocated memory may only be made SVM accessible to
+  * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes.
+  * The type of this attribute is bool.
+  */
    HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202
  } hsa_system_info_t;
  
@@ -986,8 +998,16 @@ typedef enum {
     * Minor version of the HSA runtime specification supported by the
     * agent. The type of this attribute is uint16_t.
     */
-  HSA_AGENT_INFO_VERSION_MINOR = 22
-
+  HSA_AGENT_INFO_VERSION_MINOR = 22,
+  /**
+   * This enum does not have a fixed underlying type, thus in C++ post D2338:
+   * If the enumeration type does not have a fixed underlying type, the value is
+   * unchanged if the original value is within the range of the enumeration
+   * values (9.7.1 [dcl.enum]), and otherwise, the behavior is
+   * undefined.
+   * Thus increase the range of this enum to encompass vendor extensions.
+   */
+  HSA_AGENT_INFO_LAST = INT32_MAX
  } hsa_agent_info_t;
  
  /**
diff --git a/include/hsa_ext_amd.h b/include/hsa_ext_amd.h

index e29e88090eb02a468e347faaa63e27d7c0762086..16a6aa01d259467025172ef1372611ea3c8b5002 100644 (file)
--- a/include/hsa_ext_amd.h
+++ b/include/hsa_ext_amd.h
@@ -1,6 +1,6 @@
  ////////////////////////////////////////////////////////////////////////////////
  //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
  //
  // Permission is hereby granted, free of charge, to any person or organization
  // obtaining a copy of the software and accompanying documentation covered by
@@ -41,6 +41,115 @@
  extern "C" {
  #endif
  
+/** \addtogroup aql Architected Queuing Language
+ *  @{
+ */
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants.
+ */
+typedef uint32_t hsa_signal_condition32_t;
+
+/**
+ * @brief AMD vendor specific packet type.
+ */
+typedef enum {
+  /**
+   * Packet used by agents to delay processing of subsequent packets until a
+   * configurable condition is satisfied by an HSA signal.  Only kernel dispatch
+   * queues created from AMD GPU Agents support this packet.
+   */
+  HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2,
+} hsa_amd_packet_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants.
+ */
+typedef uint8_t hsa_amd_packet_type8_t;
+
+/**
+ * @brief AMD vendor specific AQL packet header
+ */
+typedef struct hsa_amd_packet_header_s {
+  /**
+   * Packet header. Used to configure multiple packet parameters such as the
+   * packet type. The parameters are described by ::hsa_packet_header_t.
+   */
+  uint16_t header;
+
+  /**
+   *Format of the vendor specific packet.
+   */
+  hsa_amd_packet_type8_t AmdFormat;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint8_t reserved;
+} hsa_amd_vendor_packet_header_t;
+
+/**
+ * @brief AMD barrier value packet.  Halts packet processing and waits for
+ * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value
+ * is the value of the signal ::signal.
+ */
+typedef struct hsa_amd_barrier_value_packet_s {
+  /**
+   * AMD vendor specific packet header.
+   */
+  hsa_amd_vendor_packet_header_t header;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Dependent signal object. A signal with a handle value of 0 is
+   * allowed and is interpreted by the packet processor a satisfied
+   * dependency.
+   */
+  hsa_signal_t signal;
+
+  /**
+   * Value to compare against.
+   */
+  hsa_signal_value_t value;
+
+  /**
+   * Bit mask to be combined by bitwise AND with ::signal's value.
+   */
+  hsa_signal_value_t mask;
+
+  /**
+   * Comparison operation.  See ::hsa_signal_condition_t.
+   */
+  hsa_signal_condition32_t cond;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint32_t reserved1;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved2;
+
+  /**
+   * Reserved. Must be 0.
+   */
+  uint64_t reserved3;
+
+  /**
+   * Signal used to indicate completion of the job. The application can use the
+   * special signal handle 0 to indicate that no signal is used.
+   */
+  hsa_signal_t completion_signal;
+} hsa_amd_barrier_value_packet_t;
+
+/** @} */
+
  /**
   * @brief Enumeration constants added to ::hsa_status_t.
   *
@@ -61,6 +170,20 @@ enum {
     * Agent executed an invalid shader instruction.
     */
    HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+
+  /**
+   * Agent attempted to access an inaccessible address.
+   * See hsa_amd_register_system_event_handler and
+   * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses.
+   */
+  HSA_STATUS_ERROR_MEMORY_FAULT = 43,
+
+  /**
+   * The CU mask was successfully set but the mask attempted to enable a CU
+   * which was disabled for the process.  CUs disabled for the process remain
+   * disabled.
+   */
+  HSA_STATUS_CU_MASK_REDUCED = 44,
  };
  
  /**
@@ -479,6 +602,37 @@ hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uin
                                             const hsa_agent_t* consumers, uint64_t attributes,
                                             hsa_signal_t* signal);
  
+/**
+ * @brief Returns a pointer to the value of a signal.
+ *
+ * Use of this API does not modify the lifetime of ::signal and any
+ * hsa_signal_value_t retrieved by this API has lifetime equal to that of
+ * ::signal.
+ *
+ * This API is intended for partial interoperability with non-HSA compatible
+ * devices and should not be used where HSA interfaces are available.
+ *
+ * Use of the signal value must comply with use restritions of ::signal.
+ * Use may result in data races if the operations performed are not platform
+ * atomic.  Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC
+ * attributed signals is required.
+ *
+ * @param[in] Signal handle to extract the signal value pointer from.
+ *
+ * @param[out] Location where the extracted signal value pointer will be placed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL.
+ */
+hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal,
+                                          volatile hsa_signal_value_t** value_ptr);
+
  /**
   * @brief Asyncronous signal handler function type.
   *
@@ -621,32 +775,69 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
                                                      void* value);
  
  /**
- * @brief Set a CU affinity to specific queues within the process, this function
- * call is "atomic".
+ * @brief Set a queue's CU affinity mask.
+ *
+ * @details Enables the queue to run on only selected CUs.  The given mask is
+ * combined by bitwise AND with any device wide mask in HSA_CU_MASK before
+ * being applied.
+ * If num_cu_mask_count is 0 then the request is interpreted as a request to
+ * enable all CUs and no cu_mask array need be given.
   *
   * @param[in] queue A pointer to HSA queue.
   *
- * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
   *
   * @param[in] cu_mask Bit-vector representing the CU mask.
   *
   * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
   *
+ * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed
+ * but the given mask attempted to enable a CU which was disabled by
+ * HSA_CU_MASK.  CUs disabled by HSA_CU_MASK remain disabled.
+ *
   * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
   * initialized.
   *
   * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
   *
   * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not
- * multiple of 32 or @p cu_mask is NULL.
- *
- * @retval ::HSA_STATUS_ERROR failed to call thunk api
+ * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL.
+ * Devices with work group processors must even-index contiguous pairwise
+ * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110)
+ * are invalid.
   *
   */
  hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
                                                 uint32_t num_cu_mask_count,
                                                 const uint32_t* cu_mask);
  
+/**
+ * @brief Retrieve a queue's CU affinity mask.
+ *
+ * @details Returns the first num_cu_mask_count bits of a queue's CU mask.
+ * Ensure that num_cu_mask_count is at least as large as
+ * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask.
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits.
+ *
+ * @param[out] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not
+ * a multiple of 32 or @p cu_mask is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count,
+                                               uint32_t* cu_mask);
+
  /**
   * @brief Memory segments associated with a memory pool.
   */
@@ -778,6 +969,24 @@ typedef enum {
    HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
  } hsa_amd_memory_pool_info_t;
  
+/**
+ * @brief Memory pool flag used to specify allocation directives
+ *
+ */
+typedef enum hsa_amd_memory_pool_flag_s {
+  /**
+   * Allocates memory that conforms to standard HSA memory consistency model
+   */
+  HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0,
+  /**
+   * Allocates fine grain memory type where memory ordering is per point to point
+   * connection. Atomic memory operations on these memory buffers are not
+   * guaranteed to be visible at system scope.
+   */
+  HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1,
+
+} hsa_amd_memory_pool_flag_t;
+
  /**
   * @brief Get the current value of an attribute of a memory pool.
   *
@@ -846,7 +1055,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
   * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
   *
   * @param[in] flags A bit-field that is used to specify allocation
- * directives. Reserved parameter, must be 0.
+ * directives.
   *
   * @param[out] ptr Pointer to the location where to store the base virtual
   * address of
@@ -903,6 +1112,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
   *
   * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly
   * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
   *
   * @param[in] src A valid pointer to the source of data to be copied. The source
   * buffer must not overlap with the destination buffer, otherwise the copy will succeed
@@ -910,6 +1121,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
   *
   * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly
   * access both the source and destination buffers in their current locations.
+ * May be zero in which case the runtime will attempt to discover the destination agent.
+ * Discovery may have variable and/or high latency.
   *
   * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
   * performed and the function returns success. Copying a number of bytes larger
@@ -920,9 +1133,9 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
   *
   * @param[in] dep_signals List of signals that must be waited on before the copy
   * operation starts. The copy will start after every signal has been observed with
- * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy
- * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this
- * argument is ignored.
+ * the value 0. The dependent signal should not include completion signal from
+ * hsa_amd_memory_async_copy operation to be issued in future as that can result
+ * in a deadlock. If @p num_dep_signals is 0, this argument is ignored.
   *
   * @param[in] completion_signal Signal used to indicate completion of the copy
   * operation. When the copy operation is finished, the value of the signal is
@@ -937,7 +1150,7 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
   * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
   * initialized.
   *
- * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access.
   *
   * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
   *
@@ -1576,6 +1789,12 @@ typedef struct hsa_amd_pointer_info_s {
    GPU boards) any such agent may be returned.
    */
    hsa_agent_t agentOwner;
+  /*
+  Contains a bitfield of hsa_amd_memory_pool_global_flag_t values.
+  Reports the effective global flags bitmask for the allocation.  This field is not meaningful if
+  the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN.
+  */
+  uint32_t global_flags;
  } hsa_amd_pointer_info_t;
  
  /**
@@ -1611,7 +1830,7 @@ typedef struct hsa_amd_pointer_info_s {
   *
   * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
   */
-hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr,
                                            hsa_amd_pointer_info_t* info,
                                            void* (*alloc)(size_t),
                                            uint32_t* num_agents_accessible,
@@ -1635,7 +1854,7 @@ hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
   *
   * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
   */
-hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr,
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr,
                                                         void* userdata);
  
  /**
@@ -1809,11 +2028,11 @@ typedef enum {
    // GPU attempted access to a host only page.
    HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
    // DRAM ECC failure.
-  HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4,
+  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
    // Can't determine the exact fault address.
    HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
    // SRAM ECC failure (ie registers, no fault address).
-  HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6,
+  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
    // GPU reset following unspecified hang.
    HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
  } hsa_amd_memory_fault_reason_t;
@@ -1970,6 +2189,181 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
  hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
                                                        hsa_amd_deallocation_callback_t callback);
  
+typedef enum hsa_amd_svm_model_s {
+  /**
+   * Updates to memory with this attribute conform to HSA memory consistency
+   * model.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0,
+  /**
+   * Writes to memory with this attribute can be performed by a single agent
+   * at a time.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1,
+  /**
+   * Memory region queried contains subregions with both
+   * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and
+   * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes.
+   *
+   * This attribute can not be used in hsa_amd_svm_attributes_set.  It is a
+   * possible return from hsa_amd_svm_attributes_get indicating that the query
+   * region contains both coarse and fine grained memory.
+   */
+  HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2
+} hsa_amd_svm_model_t;
+
+typedef enum hsa_amd_svm_attribute_s {
+  // Memory model attribute.
+  // Type of this attribute is hsa_amd_svm_model_t.
+  HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0,
+  // Marks the range read only.  This allows multiple physical copies to be
+  // placed local to each accessing device.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_ONLY = 1,
+  // Automatic migrations should attempt to keep the memory within the xgmi hive
+  // containing accessible agents.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2,
+  // Page granularity to migrate at once.  Page granularity is specified as
+  // log2(page_count).
+  // Type of this attribute is uint64_t.
+  HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3,
+  // Physical location to prefer when automatic migration occurs.
+  // Set to the null agent handle (handle == 0) to indicate there
+  // is no preferred location.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set (see
+  // ::hsa_amd_svm_prefetch_async).
+  // Queries the physical location of most recent prefetch command.
+  // If the prefetch location has not been set or is not uniform across the
+  // address range then returned hsa_agent_t::handle will be 0.
+  // Querying this attribute will return the destination agent of the most
+  // recent ::hsa_amd_svm_prefetch_async targeting the address range.  If
+  // multiple async prefetches have been issued targeting the region and the
+  // most recently issued prefetch has completed then the query will return
+  // the location of the most recently completed prefetch.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5,
+  // Optimizes with the anticipation that the majority of operations to the
+  // range will be read operations.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6,
+  // Allows the execution on GPU.
+  // Type of this attribute is bool.
+  HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range.  Access may incur a page fault
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM
+  // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Enables an agent for access to the range without page faults.  Access
+  // will not incur a page fault and will not cause access based migration.
+  // and associated memory migration.  Either this or
+  // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if
+  // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_get.
+  // Denies an agent access to the memory range.  Access will cause a terminal
+  // segfault.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202,
+  // This attribute can not be used in ::hsa_amd_svm_attributes_set.
+  // Returns the access attribute associated with the agent.
+  // The agent to query must be set in the attribute value field.
+  // The attribute enum will be replaced with the agent's current access
+  // attribute for the address range.
+  // TODO: Clarify KFD return value for non-uniform access attribute.
+  // Type of this attribute is hsa_agent_t.
+  HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203,
+} hsa_amd_svm_attribute_t;
+
+// List type for hsa_amd_svm_attributes_set/get. 
+typedef struct hsa_amd_svm_attribute_pair_s {
+  // hsa_amd_svm_attribute_t value.
+  uint64_t attribute;
+  // Attribute value.  Bit values should be interpreted according to the type
+  // given in the associated attribute description.
+  uint64_t value;
+} hsa_amd_svm_attribute_pair_t;
+
+/**
+ * @brief Sets SVM memory attributes.
+ *
+ * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling
+ * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE
+ * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM
+ * memory access by that Agent.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION
+ * may not be used with this API.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Gets SVM memory attributes.
+ *
+ * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE,
+ * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and
+ * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API.
+ *
+ * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an
+ * hsa_agent_t and returns the current access type through its attribute field.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] attribute_list List of attributes to set for the address range.
+ *
+ * @param[in] attribute_count Length of @p attribute_list.
+ */
+hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size,
+                                        hsa_amd_svm_attribute_pair_t* attribute_list,
+                                        size_t attribute_count);
+
+/**
+ * @brief Asynchronously migrates memory to an agent.
+ *
+ * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero.
+ * @p completion_signal will decrement when the migration is complete.
+ *
+ * @param[in] ptr Will be aligned down to nearest page boundary.
+ *
+ * @param[in] size Will be aligned up to nearest page boundary.
+ *
+ * @param[in] agent Agent to migrate to.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the migration
+ * operation starts. The migration will start after every signal has been observed with
+ * the value 0. If @p num_dep_signals is 0, this argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the migration
+ * operation. When the migration operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. If no completion signal is required this handle may be null.
+ */
+hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent,
+                                        uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+                                        hsa_signal_t completion_signal);
+
  #ifdef __cplusplus
  }  // end extern "C" block
  #endif
diff --git a/include/hsa_ext_image.h b/include/hsa_ext_image.h

index e94d8da202d60036d4c9150f8d9f01ee8ec263a5..ab820b3e3c8717f367bd0c70ea54c05df868c2d7 100644 (file)
--- a/include/hsa_ext_image.h
+++ b/include/hsa_ext_image.h
@@ -1,6 +1,6 @@
  ////////////////////////////////////////////////////////////////////////////////
  //
-// Copyright (C) 2014-2020 Advanced Micro Devices Inc.  All rights reserved.
+// Copyright (C) 2014-2022 Advanced Micro Devices Inc.  All rights reserved.
  //
  // Permission is hereby granted, free of charge, to any person or organization
  // obtaining a copy of the software and accompanying documentation covered by
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in

index 5f8a5f570647ec64d6f99e4db95170c245a2e7d3..5dca37c5aac51528b9a9eb89eb953bf41de278d2 100644 (file)
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -187,7 +187,9 @@ am__DEPENDENCIES_1 =
  @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
  @PLUGIN_GCN_TRUE@      $(am__DEPENDENCIES_1)
  @PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS =  \
-@PLUGIN_GCN_TRUE@      libgomp_plugin_gcn_la-plugin-gcn.lo
+@PLUGIN_GCN_TRUE@      libgomp_plugin_gcn_la-plugin-gcn.lo \
+@PLUGIN_GCN_TRUE@      libgomp_plugin_gcn_la-simple-allocator.lo \
+@PLUGIN_GCN_TRUE@      libgomp_plugin_gcn_la-mutex.lo
  libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
  AM_V_lt = $(am__v_lt_@AM_V@)
  am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -584,7 +586,9 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
  
  # AMD GCN plugin
  @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
-@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \
+@PLUGIN_GCN_TRUE@                              plugin/mutex.c
+
  @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
  @PLUGIN_GCN_TRUE@      -D_GNU_SOURCE
  
@@ -760,7 +764,9 @@ distclean-compile:
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@
@@ -823,6 +829,20 @@ libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c
  @AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
  @am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
  
+libgomp_plugin_gcn_la-simple-allocator.lo: simple-allocator.c
+@am__fastdepCC_TRUE@   $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-simple-allocator.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c
+@am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      $(AM_V_CC)source='simple-allocator.c' object='libgomp_plugin_gcn_la-simple-allocator.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c
+
+libgomp_plugin_gcn_la-mutex.lo: plugin/mutex.c
+@am__fastdepCC_TRUE@   $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-mutex.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c
+@am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      $(AM_V_CC)source='plugin/mutex.c' object='libgomp_plugin_gcn_la-mutex.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c
+
  libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin/plugin-nvptx.c
  @am__fastdepCC_TRUE@   $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin/plugin-nvptx.c' || echo '$(srcdir)/'`plugin/plugin-nvptx.c
  @am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h

index 5b4704484dd00a8931aba2e21fa3d249804ae024..71e74527e7191dd0e25e4a428fd5fa5358b4744e 100644 (file)
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -236,6 +236,18 @@ extern const char *GOMP_OFFLOAD_get_interop_type_desc (struct interop_obj_t *,
                                                        omp_interop_property_t);
  #endif
  
+/* simple-allocator.c  */
+
+typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p;
+
+gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context ();
+void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx,
+                                       char *base, size_t size);
+void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size);
+void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr);
+void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr,
+                          size_t newsize);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h

index ff445d1e90c6f70899a11ce123ea05f9cc8f2717..46db7d41f3222f851d692d827ec062932fee7a9b 100644 (file)
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1676,16 +1676,7 @@ gomp_thread_to_pthread_t (struct gomp_thread *thr)
  }
  #endif
  
-/* simple-allocator.c  */
-
-typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p;
-
-gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context ();
-void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx,
-                                       char *base, size_t size);
-void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size);
-void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr);
-void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr,
-                          size_t newsize);
+/* simple-allocator.c has its prototypes in libgomp-plugin.h so it's
+   accessible from both.  */
  
  #endif /* LIBGOMP_H */
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi

index 2d37c873174407f6048f419eea7cdb7d60d9b082..abe8ef75d8e7483290fe34c7e9045309c1b1858b 100644 (file)
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -7185,11 +7185,26 @@ The implementation remark:
        a performance boost for NVPTX offload code and also allows unlimited use
        of pinned memory regardless of the OS @code{ulimit}/@code{rlimit}
        settings.
-@item Managed memory allocated with the OpenMP
+@item Managed memory allocated on the host with the
        @code{ompx_gnu_managed_mem_alloc} allocator or in the
-      @code{ompx_gnu_managed_mem_space} is not currently supported for AMD GPU
-      devices; attempting to use it in an allocator will trigger the fall-back
-      trait.
+      @code{ompx_gnu_managed_mem_space} (both GNU extensions) allocate memory
+      equivalent to HIP Managed Memory, although @emph{not} actually allocated
+      using @code{hipMallocManaged}.  This memory is accessible by both the
+      host and the device at the same address, so it need not be mapped with
+      @code{map} clauses.  Instead, use the @code{is_device_ptr} clause or
+      @code{has_device_addr} clause to indicate that the pointer is already
+      accessible on the device.  The ROCm runtime will automatically handle
+      data migration between host and device as needed.  Not all AMD GPU
+      devices support this feature, and many that do require that
+      @code{-mxnack=on} is configured at compile time.  If managed memory is
+      not supported by the default device, as configured at the moment the
+      allocator is called, then the allocator will use the fall-back setting.
+      If the default device is configured differently when the memory is freed,
+      via @code{omp_free} or @code{omp_realloc}, the result may be undefined.
+      If the current device does not support Unified Shared Memory (or it is
+      not enabled with @code{HSA_XNACK=1}) then Managed Memory might still
+      work, but allocations may only be visible to a single device (whichever
+      was the default device when the @emph{first} allocation was made).
  @item The OpenMP routines @code{omp_target_memcpy_rect} and
        @code{omp_target_memcpy_rect_async} and the @code{target update}
        directive for non-contiguous list items use the 3D memory-copy function
diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am

index 9c273e72f7891e038fdc20555df80a1fd2a53831..dbc02f3cda8ee48be3ec9ed6515239138957c124 100644 (file)
--- a/libgomp/plugin/Makefrag.am
+++ b/libgomp/plugin/Makefrag.am
@@ -57,7 +57,8 @@ if PLUGIN_GCN
  # AMD GCN plugin
  libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
  toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
-libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \
+                               plugin/mutex.c
  libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \
         -D_GNU_SOURCE
  libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
diff --git a/libgomp/plugin/mutex.c b/libgomp/plugin/mutex.c

new file mode 100644 (file)

index 0000000..e6981ad
--- /dev/null
+++ b/libgomp/plugin/mutex.c
@@ -0,0 +1,58 @@
+/* Mutex implementation for libgomp plugins.
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   Contributed by BayLibre
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a minimal implementation of the gomp_mutex_t spinlocks, but
+   without all the dependencies used by the config/linux/mutex implementation.
+
+   At the time of writing, this is only used by simple_alloc which has
+   short-lived locks and should be fine with these.  The actual locks are in
+   a header file, so only the fallback "slow" functions are needed here.  */
+
+#include "config.h"
+#include <unistd.h>
+#include "libgomp.h"
+
+#ifndef HAVE_SYNC_BUILTINS
+#error "HAVE_SYNC_BUILTINS is required to build this"
+#endif
+
+void
+gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval)
+{
+  while (oldval == 1)
+    {
+      usleep (1);
+      oldval = __atomic_exchange_n (mutex, 1, __ATOMIC_ACQUIRE);
+    }
+}
+
+void
+gomp_mutex_unlock_slow (gomp_mutex_t *mutex)
+{
+  GOMP_PLUGIN_fatal ("gomp_mutex_unlock_slow should be unreachable");
+}
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c

index cd5a19b03551a4769f900ee39cb5c4929de89cef..ece41c59bbb50300a89747ffc3a2b9980a451d6e 100644 (file)
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -50,6 +50,8 @@
  #include "oacc-plugin.h"
  #include "oacc-int.h"
  #include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
  
  /* Create hash-table for declare target's indirect clause on the host;
     see build-target-indirect-htab.h for details.  */
@@ -228,6 +230,9 @@ struct hsa_runtime_fn_info
       const hsa_dim3_t *range, hsa_agent_t copy_agent,
       hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
       const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+  hsa_status_t (*hsa_amd_svm_attributes_set_fn)
+    (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+     size_t attribute_count);
  };
  
  /* As an HIP runtime is dlopened, following structure defines function
@@ -746,6 +751,24 @@ dump_hsa_system_info (void)
      }
    else
      GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+
+  bool svm_supported;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n",
+              (svm_supported ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n");
+
+  bool svm_accessible;
+  status = hsa_fns.hsa_system_get_info_fn
+    (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n",
+              (svm_accessible ? "TRUE" : "FALSE"));
+  else
+    GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n");
  }
  
  /* Dump information about the available hardware.  */
@@ -1470,6 +1493,7 @@ init_hsa_runtime_functions (void)
    DLSYM_OPT_FN (hsa_amd_memory_lock)
    DLSYM_OPT_FN (hsa_amd_memory_unlock)
    DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
+  DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
    return true;
  #undef DLSYM_OPT_FN
  #undef DLSYM_FN
@@ -2527,6 +2551,13 @@ isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image,
               "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
               "devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
               device_isa_s, agent_isa_s, agent->device_id);
+  else if (strcmp (device_isa_s, agent_isa_s) == 0)
+    snprintf (msg, sizeof msg,
+             "GCN code object features do not match for an unknown reason "
+             "(device %d).\n"
+             "Try to adjust the HSA_XNACK setting (perhaps?), or use\n"
+             "ROCR_VISIBLE_DEVICES to disable incompatible devices.\n",
+             agent->device_id);
    else
      snprintf (msg, sizeof msg,
               "GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
@@ -3188,6 +3219,125 @@ wait_queue (struct goacc_asyncqueue *aq)
  }
  
  /* }}}  */
+/* {{{ Managed Memory
+
+   This implements an allocator equivalent to CUDA "Managed" memory, in which
+   the pages automatically migrate between host and device memory, as needed.
+   These allocations are visible from both the host and devices without the
+   need for explicit mappings.  However, OpenMP does need "is_device_ptr" or
+   "has_device_addr" to function properly.
+
+   There isn't a high-level HSA/ROCr API to allocate managed memory, so we
+   use regular memory and register it with the driver by setting it to
+   "coarse-grained" mode, and setting the "accessible by default" attribute
+   on devices where HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT isn't set
+   as standard (as it isn't on systems that don't support USM, or when
+   HSA_XNACK != 1).
+
+   This is in contrast to GOMP_OFFLOAD_alloc which allocates coarse-grained
+   *GPU memory*, which is not visible on the host.
+
+   It would be possible to register memory returned by malloc, but
+   experimentation shows that doing so causes memory faults within the HSA
+   runtime code.  Therefore, the Managed memory space is allocated as a
+   largish block and then subdivided via a custom allocator.  The "simple"
+   allocator is designed specifically to store its free-chain outside of
+   the registered pages so that allocation does not inadvertently cause
+   pages to migrate.
+
+   Note: if the user has multiple mismatched devices, and one or more do
+   not support USM (or XNACK is off), then each page of the Managed heap
+   could end up associated with a different device (by calling omp_alloc
+   before and after omp_set_default_device).  This issue remains
+   an *unhandled* edge-case, at present.  */
+
+gomp_simple_alloc_ctx_p managed_ctx = NULL;
+
+/* Initialize or extend the Managed memory space.  This is called whenever
+   allocation fails.  SIZE is the minimum size required for the failed
+   allocation to succeed; the function may choose a larger size.
+   Note that Linux lazy allocation means that the memory returned isn't
+   guaranteed to actually exist.  */
+
+static bool
+managed_heap_create (struct agent_info *agent, size_t size)
+{
+  static int lock = 0;
+  while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE) != 0)
+    ;
+
+  size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */
+  if (size < default_size)
+    size = default_size;
+
+  /* Round up to a whole page.  */
+  int pagesize = getpagesize ();
+  int misalignment = size % pagesize;
+  if (misalignment > 0)
+    size += pagesize - misalignment;
+
+  /* Try to get contiguous memory, but it might not be possible.
+     The most recent previous allocation is at the head of the list.  */
+  static void *addrhint = NULL;
+  void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (!new_pages)
+    {
+      GCN_DEBUG ("Could not allocate Managed Memory heap.");
+      __atomic_store_n (&lock, 0, __ATOMIC_RELEASE);
+      return false;
+    }
+
+  /* Register the heap allocation as coarse grained, "Managed" memory.  */
+  struct hsa_amd_svm_attribute_pair_s attr = {
+    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+  };
+  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+                                                              &attr, 1);
+  if (status != HSA_STATUS_SUCCESS)
+    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+                      " please update your drivers and/or kernel");
+
+  /* The HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE setting is required on devices
+     without default SVM.  */
+  static int svm_accessible = 0xff; /* Use 0xff as "undefined".  */
+  if (svm_accessible == 0xff)
+    {
+      status = hsa_fns.hsa_system_get_info_fn
+       (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible);
+      if (status != HSA_STATUS_SUCCESS)
+       {
+         GCN_DEBUG ("warning: failed to query "
+                    " HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT\n");
+         svm_accessible = false;
+       }
+    }
+  if (svm_accessible == false)
+    {
+      struct hsa_amd_svm_attribute_pair_s attr2;
+      attr2.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE;
+      attr2.value = agent->id.handle;
+      status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size, &attr2,
+                                                     1);
+      if (status != HSA_STATUS_SUCCESS)
+       GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+                          " please update your drivers and/or kernel");
+    }
+
+  addrhint = new_pages + size;
+
+  /* Initialize a new Managed memory heap, or add the new memory into an
+     existing Managed memory heap.  */
+  if (!managed_ctx)
+    managed_ctx = gomp_simple_alloc_init_context ();
+  gomp_simple_alloc_register_memory (managed_ctx, new_pages, size);
+
+  __atomic_store_n (&lock, 0, __ATOMIC_RELEASE);
+  return true;
+}
+
+/* }}} */
  /* {{{ OpenACC support  */
  
  /* Execute an OpenACC kernel, synchronously or asynchronously.  */
@@ -5061,6 +5211,35 @@ GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        GOMP_PLUGIN_target_task_completion, async_data);
  }
  
+/* Allocate memory suitable for Managed Memory.  */
+
+void *
+GOMP_OFFLOAD_managed_alloc (int device, size_t size)
+{
+  struct agent_info *agent = get_agent_info (device);
+  while (1)
+    {
+      void *result = gomp_simple_alloc (managed_ctx, size);
+      if (result)
+       return result;
+
+      /* Allocation failed.  Try again if we can create a new heap block.
+        Note: it's possible another thread could get to the new memory
+        first, so the while loop is necessary. */
+      if (!managed_heap_create (agent, size))
+       return NULL;
+    }
+}
+
+/* Free memory allocated via GOMP_OFFLOAD_managed_alloc.  */
+
+bool
+GOMP_OFFLOAD_managed_free (int device, void *ptr)
+{
+  gomp_simple_free (managed_ctx, ptr);
+  return true;
+}
+
  /* }}} */
  /* {{{ OpenACC Plugin API  */
  
diff --git a/libgomp/simple-allocator.c b/libgomp/simple-allocator.c

index 531bd18e74a65f7e589e6e7eb18821fbb2f40b12..25ec2c77c727c0d28733ef85451fe06755102541 100644 (file)
--- a/libgomp/simple-allocator.c
+++ b/libgomp/simple-allocator.c
@@ -312,4 +312,5 @@ gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr, size_t newsize)
  /* Include the splay tree code inline, with the prefixes added.  */
  #define splay_tree_prefix simple_alloc
  #define splay_tree_c
+#define gomp_fatal GOMP_PLUGIN_fatal  /* So it links into a plugin.  */
  #include "splay-tree.h"
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp

index ba55cd39e2ba46663a7005c089b541a5fe3d0b58..f5683b50725a86f845fed84f35ece71e7528a379 100644 (file)
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -729,5 +729,41 @@ proc check_effective_target_omp_managedmem { } {
      if { [check_effective_target_offload_device_nvptx] } {
         return 1
      }
+
+    if { [libgomp_check_effective_target_offload_target "amdgcn"] } {
+       if [check_runtime_nocache managed_available_ {
+           #include <omp.h>
+           #include <stdlib.h>
+           int main ()
+           {
+             const omp_alloctrait_t traits[] = {
+                 { omp_atk_fallback, omp_atv_null_fb }
+             };
+             omp_allocator_handle_t managed_no_fallback
+                = omp_init_allocator (ompx_gnu_managed_mem_space, 1, traits);
+             void *a = omp_alloc (16, managed_no_fallback);
+             return a == NULL;
+           }
+       } ] {
+         return 1
+       }
+    }
+
+    return 0
+}
+
+# return 1 if -mxnack=on is accepted
+
+proc check_effective_target_offload_target_amdgcn_with_xnack { } {
+    if { [libgomp_check_effective_target_offload_target "amdgcn"] } {
+       return [check_no_compiler_messages amd_xnack_ executable {
+          int main () {
+            #pragma omp target
+              ;
+            return 0;
+          }
+       } "-foffload-options=amdgcn-amdhsa=-mxnack=on" ]
+    }
+
      return 0
  }
diff --git a/libgomp/testsuite/libgomp.c++/alloc-managed-1.C b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C

index afd7fd648c6739ef0bc7fc0548bbbd3836986f18..86de0aac400499b667a38467dd0000201fa85091 100644 (file)
--- a/libgomp/testsuite/libgomp.c++/alloc-managed-1.C
+++ b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C
@@ -1,5 +1,6 @@
  // { dg-do run }
  // { dg-require-effective-target omp_managedmem }
+// { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } }
  
  // Check that the ompx::allocator::gnu_managed_mem allocator can allocate
  // Managed Memory, and that host and target can see the data, at the same
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c

index 8cb4821ee53c2be1956082c0253e67c2c3df49f3..2ebd9c1c8ce64e7cd07198f7adfbd3d74b04549e 100644 (file)
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c
@@ -3,6 +3,10 @@
  /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
  /* { dg-additional-sources requires-4-aux.c } */
  
+/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware
+   support is limited, which results in a diagnostic.  */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
+
  /* Check no diagnostic by device-compiler's or host compiler's lto1.
     Other file uses: 'requires reverse_offload', but that's inactive as
     there are no declare target directives, device constructs nor device routines  */
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c

index 0e0db927c2c18fabb9a11e258a9becb261e11ea7..85301cf31d9700c374ea2750d96365c13cf8d574 100644 (file)
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c
@@ -3,6 +3,10 @@
  /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
  /* { dg-additional-sources requires-4-aux.c } */
  
+/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware
+   support is limited, which results in a diagnostic.  */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
+
  /* Same as requires-4.c, but uses heap memory for 'a'.  */
  
  /* Check no diagnostic by device-compiler's or host compiler's lto1.
diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c

index d43d78db6fab61cdc34804a3adf38245576f5bf5..4fd7f1c7885cd1125befed74397f89dc40fb2346 100644 (file)
--- a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c
@@ -1,5 +1,6 @@
  /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
  /* { dg-additional-sources requires-5-aux.c } */
+/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */
  
  /* Depending on offload device capabilities, it may print something like the
     following (only) if GOMP_DEBUG=1:
diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-1.c b/libgomp/testsuite/libgomp.c/alloc-managed-1.c

index 31b252fc0ae6339aa98d60e4df977fdf348dbcef..88ddcf36d4af3fa33cb24f14ddc38b5d5b6ddb30 100644 (file)
--- a/libgomp/testsuite/libgomp.c/alloc-managed-1.c
+++ b/libgomp/testsuite/libgomp.c/alloc-managed-1.c
@@ -1,5 +1,6 @@
  /* { dg-do run } */
  /* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn_with_xnack } } */
  
  /* Check that omp_alloc can allocate Managed Memory, and that host and target
     can see the data, at the same address, without a mapping.  */
diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-2.c b/libgomp/testsuite/libgomp.c/alloc-managed-2.c

index f7fd30a4f67907aef0919f8018d16d2065dea42e..660f6e6ed2551da1b980d3c21090390eb82adcd9 100644 (file)
--- a/libgomp/testsuite/libgomp.c/alloc-managed-2.c
+++ b/libgomp/testsuite/libgomp.c/alloc-managed-2.c
@@ -1,5 +1,6 @@
  /* { dg-do run } */
  /* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
  
  /* Check that omp_calloc can allocate Managed Memory, and that host and target
     can see the data, at the same address, without a mapping.  */
diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-3.c b/libgomp/testsuite/libgomp.c/alloc-managed-3.c

index 17828b76962e173b867efcab37bdc22a9a6d3dbc..fefdeb3a932e34023e78890f113edd174e63f883 100644 (file)
--- a/libgomp/testsuite/libgomp.c/alloc-managed-3.c
+++ b/libgomp/testsuite/libgomp.c/alloc-managed-3.c
@@ -1,5 +1,6 @@
  /* { dg-do run } */
  /* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
  
  /* Check that omp_realloc can allocate Managed Memory, and that host and target
     can see the data, at the same address, without a mapping.  */
diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-4.c b/libgomp/testsuite/libgomp.c/alloc-managed-4.c

index 4eaf8259b6fce568f167939e07584dfcc95a03c6..577e3e28ec1f8257d433fd248271f5cd3c73d0d7 100644 (file)
--- a/libgomp/testsuite/libgomp.c/alloc-managed-4.c
+++ b/libgomp/testsuite/libgomp.c/alloc-managed-4.c
@@ -1,5 +1,6 @@
  /* { dg-do run } */
  /* { dg-require-effective-target omp_managedmem } */
+/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */
  /* { dg-shouldfail "" } */
  /* { dg-output "libgomp: attempted to free managed memory at 0x\[0-9a-f\]+, but the default device is set to the host device" } */
  
diff --git a/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90

index 685aeef7dae216f17d20561ca1eeb94260499c33..e19eb043daaa8e84f8003ebc1d53ae804148f717 100644 (file)
--- a/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90
+++ b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90
@@ -1,5 +1,6 @@
  ! { dg-do run }
  ! { dg-require-effective-target omp_managedmem }
+! { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } }
  
  ! Check that omp_alloc can allocate Managed Memory, and that host and target
  ! can see the data, at the same address, without a mapping.
author	Andrew Stubbs <ams@baylibre.com>
	Thu, 20 Nov 2025 17:18:13 +0000 (17:18 +0000)
committer	Andrew Stubbs <ams@baylibre.com>
	Mon, 1 Dec 2025 12:03:35 +0000 (12:03 +0000)
include/hsa.h		patch \| blob \| blame \| history
include/hsa_ext_amd.h		patch \| blob \| blame \| history
include/hsa_ext_image.h		patch \| blob \| blame \| history
libgomp/Makefile.in		patch \| blob \| blame \| history
libgomp/libgomp-plugin.h		patch \| blob \| blame \| history
libgomp/libgomp.h		patch \| blob \| blame \| history
libgomp/libgomp.texi		patch \| blob \| blame \| history
libgomp/plugin/Makefrag.am		patch \| blob \| blame \| history
libgomp/plugin/mutex.c	[new file with mode: 0644]	patch \| blob
libgomp/plugin/plugin-gcn.c		patch \| blob \| blame \| history
libgomp/simple-allocator.c		patch \| blob \| blame \| history
libgomp/testsuite/lib/libgomp.exp		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c++/alloc-managed-1.C		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c-c++-common/requires-4.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c-c++-common/requires-4a.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c-c++-common/requires-5.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c/alloc-managed-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c/alloc-managed-2.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c/alloc-managed-3.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.c/alloc-managed-4.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90		patch \| blob \| blame \| history