]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
drm/amdkfd: sever xgmi io link if host driver has disabled sharing
author Jonathan Kim <Jonathan.Kim@amd.com>
Fri, 20 Sep 2024 15:46:05 +0000 (11:46 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 24 Oct 2024 22:06:34 +0000 (18:06 -0400)
Host drivers can create partial hives per guest by disabling xgmi sharing
between certain peers in the main hive.
Typically, these partial hives are fully connected per guest session.
In the event that the host makes a mistake by adding a non-shared node
to a guest session, have the KFD reflect sharing disabled by severing
the IO link.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Tested-by: James Yao <yiqing.yao@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
drivers/gpu/drm/amd/amdkfd/kfd_crat.c

index 3ef5066ca52948e8a5a7af77b4f8fd7ca40a14b3..b47422b0b5b104582290a3c8a40bd9d8f87cc0a5 100644 (file)
@@ -801,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
        return  -EINVAL;
 }
 
+/* Report whether adev's xgmi topology marks sharing enabled for peer_adev. */
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+                                       struct amdgpu_device *peer_adev)
+{
+       struct psp_xgmi_topology_info *topo = &adev->psp.xgmi_context.top_info;
+       int idx;
+
+       /* Outside of SR-IOV, sharing is expected to always be enabled. */
+       if (!amdgpu_sriov_vf(adev))
+               return true;
+       for (idx = 0; idx < topo->num_nodes; idx++) {
+               if (topo->nodes[idx].node_id == peer_adev->gmc.xgmi.node_id)
+                       return topo->nodes[idx].is_sharing_enabled != 0;
+       }
+       return false; /* peer_adev's node_id is not listed in the topology */
+}
+
 /*
  * Devices that support extended data require the entire hive to initialize with
  * the shared memory buffer flag set.
index 41d5f97fc77acb537adfb21f1bdd04f3887e3805..8cc7ab38db7c78a3da3ed93dc6c17d2430b1491a 100644 (file)
@@ -66,6 +66,8 @@ int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
                struct amdgpu_device *peer_adev);
 int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
                struct amdgpu_device *peer_adev);
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+                                       struct amdgpu_device *peer_adev);
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
                                           uint64_t addr);
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
index 48caecf7e72ed139872819c1a48cf96ae4528a58..723f1220e1cc987cea1b57c1c0e865f92c2b252d 100644 (file)
@@ -28,6 +28,7 @@
 #include "kfd_topology.h"
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_xgmi.h"
 
 /* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
  * GPU processor ID are expressed with Bit[31]=1.
@@ -2329,6 +2330,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
                                continue;
                        if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
                                continue;
+                       if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
+                               continue;
                        sub_type_hdr = (typeof(sub_type_hdr))(
                                (char *)sub_type_hdr +
                                sizeof(struct crat_subtype_iolink));