--- /dev/null
+From c0cc999f3c32e65a7c88fb323893ddf897b24488 Mon Sep 17 00:00:00 2001
+From: Ma Jun <Jun.Ma2@amd.com>
+Date: Wed, 2 Nov 2022 15:53:26 +0800
+Subject: drm/amdkfd: Fix the warning of array-index-out-of-bounds
+
+From: Ma Jun <Jun.Ma2@amd.com>
+
+commit c0cc999f3c32e65a7c88fb323893ddf897b24488 upstream.
+
+For some GPUs with more CUs, the original sibling_map[32]
+in struct crat_subtype_cache is not large enough to hold
+the cache information when creating the VCRAT table. Fix
+this by skipping the struct crat_subtype_cache info and
+instead filling struct kfd_cache_properties directly.
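+
+A minimal sketch of the overflow follows. The loop shape comes from the
+old fill_in_l2_l3_pcache() below; the 8-shader-engine, 2-shader-array
+configuration is a hypothetical example, not a specific product:
+
+	/* The old L2/L3 fill wrote 4 sibling_map bytes per shader
+	 * array, so engines * arrays > 8 overruns sibling_map[32].
+	 */
+	int k = 0;
+	for (int i = 0; i < 8; i++)		/* num_shader_engines */
+		for (int j = 0; j < 2; j++)	/* num_shader_arrays_per_engine */
+			k += 4;			/* fills sibling_map[k..k+3] */
+	/* k ends at 64, past the end of the old 32-byte sibling_map */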
+
+Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Cc: "Limonciello, Mario" <Mario.Limonciello@amd.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 312 +++---------------------------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 12 +
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 245 ++++++++++++++++++++++-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 5
+ 4 files changed, 282 insertions(+), 292 deletions(-)
+
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+@@ -50,16 +50,6 @@ static inline unsigned int get_and_inc_g
+ return current_id;
+ }
+
+-/* Static table to describe GPU Cache information */
+-struct kfd_gpu_cache_info {
+- uint32_t cache_size;
+- uint32_t cache_level;
+- uint32_t flags;
+- /* Indicates how many Compute Units share this cache
+- * within a SA. Value = 1 indicates the cache is not shared
+- */
+- uint32_t num_cu_shared;
+-};
+
+ static struct kfd_gpu_cache_info kaveri_cache_info[] = {
+ {
+@@ -1119,9 +1109,13 @@ static int kfd_parse_subtype_cache(struc
+ props->cachelines_per_tag = cache->lines_per_tag;
+ props->cache_assoc = cache->associativity;
+ props->cache_latency = cache->cache_latency;
++
+ memcpy(props->sibling_map, cache->sibling_map,
+ sizeof(props->sibling_map));
+
++	/* Set the sibling_map_size to 32 for CRAT tables from ACPI */
++ props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;
++
+ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+ props->cache_type |= HSA_CACHE_TYPE_DATA;
+ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
+@@ -1339,125 +1333,6 @@ err:
+ return ret;
+ }
+
+-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+-static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
+- struct kfd_gpu_cache_info *pcache_info,
+- struct kfd_cu_info *cu_info,
+- int mem_available,
+- int cu_bitmask,
+- int cache_type, unsigned int cu_processor_id,
+- int cu_block)
+-{
+- unsigned int cu_sibling_map_mask;
+- int first_active_cu;
+-
+- /* First check if enough memory is available */
+- if (sizeof(struct crat_subtype_cache) > mem_available)
+- return -ENOMEM;
+-
+- cu_sibling_map_mask = cu_bitmask;
+- cu_sibling_map_mask >>= cu_block;
+- cu_sibling_map_mask &=
+- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+- first_active_cu = ffs(cu_sibling_map_mask);
+-
+- /* CU could be inactive. In case of shared cache find the first active
+- * CU. and incase of non-shared cache check if the CU is inactive. If
+- * inactive active skip it
+- */
+- if (first_active_cu) {
+- memset(pcache, 0, sizeof(struct crat_subtype_cache));
+- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+- pcache->length = sizeof(struct crat_subtype_cache);
+- pcache->flags = pcache_info[cache_type].flags;
+- pcache->processor_id_low = cu_processor_id
+- + (first_active_cu - 1);
+- pcache->cache_level = pcache_info[cache_type].cache_level;
+- pcache->cache_size = pcache_info[cache_type].cache_size;
+-
+- /* Sibling map is w.r.t processor_id_low, so shift out
+- * inactive CU
+- */
+- cu_sibling_map_mask =
+- cu_sibling_map_mask >> (first_active_cu - 1);
+-
+- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+- pcache->sibling_map[1] =
+- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+- pcache->sibling_map[2] =
+- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+- pcache->sibling_map[3] =
+- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+- return 0;
+- }
+- return 1;
+-}
+-
+-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+-static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
+- struct kfd_gpu_cache_info *pcache_info,
+- struct kfd_cu_info *cu_info,
+- int mem_available,
+- int cache_type, unsigned int cu_processor_id)
+-{
+- unsigned int cu_sibling_map_mask;
+- int first_active_cu;
+- int i, j, k;
+-
+- /* First check if enough memory is available */
+- if (sizeof(struct crat_subtype_cache) > mem_available)
+- return -ENOMEM;
+-
+- cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
+- cu_sibling_map_mask &=
+- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+- first_active_cu = ffs(cu_sibling_map_mask);
+-
+- /* CU could be inactive. In case of shared cache find the first active
+- * CU. and incase of non-shared cache check if the CU is inactive. If
+- * inactive active skip it
+- */
+- if (first_active_cu) {
+- memset(pcache, 0, sizeof(struct crat_subtype_cache));
+- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+- pcache->length = sizeof(struct crat_subtype_cache);
+- pcache->flags = pcache_info[cache_type].flags;
+- pcache->processor_id_low = cu_processor_id
+- + (first_active_cu - 1);
+- pcache->cache_level = pcache_info[cache_type].cache_level;
+- pcache->cache_size = pcache_info[cache_type].cache_size;
+-
+- /* Sibling map is w.r.t processor_id_low, so shift out
+- * inactive CU
+- */
+- cu_sibling_map_mask =
+- cu_sibling_map_mask >> (first_active_cu - 1);
+- k = 0;
+- for (i = 0; i < cu_info->num_shader_engines; i++) {
+- for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+- j++) {
+- pcache->sibling_map[k] =
+- (uint8_t)(cu_sibling_map_mask & 0xFF);
+- pcache->sibling_map[k+1] =
+- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+- pcache->sibling_map[k+2] =
+- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+- pcache->sibling_map[k+3] =
+- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+- k += 4;
+- cu_sibling_map_mask =
+- cu_info->cu_bitmap[i % 4][j + i / 4];
+- cu_sibling_map_mask &= (
+- (1 << pcache_info[cache_type].num_cu_shared)
+- - 1);
+- }
+- }
+- return 0;
+- }
+- return 1;
+-}
+-
+-#define KFD_MAX_CACHE_TYPES 6
+
+ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
+ struct kfd_gpu_cache_info *pcache_info)
+@@ -1531,231 +1406,133 @@ static int kfd_fill_gpu_cache_info_from_
+ return i;
+ }
+
+-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
+- * tables
+- *
+- * @kdev - [IN] GPU device
+- * @gpu_processor_id - [IN] GPU processor ID to which these caches
+- * associate
+- * @available_size - [IN] Amount of memory available in pcache
+- * @cu_info - [IN] Compute Unit info obtained from KGD
+- * @pcache - [OUT] memory into which cache data is to be filled in.
+- * @size_filled - [OUT] amount of data used up in pcache.
+- * @num_of_entries - [OUT] number of caches added
+- */
+-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+- int gpu_processor_id,
+- int available_size,
+- struct kfd_cu_info *cu_info,
+- struct crat_subtype_cache *pcache,
+- int *size_filled,
+- int *num_of_entries)
++int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info)
+ {
+- struct kfd_gpu_cache_info *pcache_info;
+- struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
+ int num_of_cache_types = 0;
+- int i, j, k;
+- int ct = 0;
+- int mem_available = available_size;
+- unsigned int cu_processor_id;
+- int ret;
+- unsigned int num_cu_shared;
+
+ switch (kdev->adev->asic_type) {
+ case CHIP_KAVERI:
+- pcache_info = kaveri_cache_info;
++ *pcache_info = kaveri_cache_info;
+ num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
+ break;
+ case CHIP_HAWAII:
+- pcache_info = hawaii_cache_info;
++ *pcache_info = hawaii_cache_info;
+ num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
+ break;
+ case CHIP_CARRIZO:
+- pcache_info = carrizo_cache_info;
++ *pcache_info = carrizo_cache_info;
+ num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
+ break;
+ case CHIP_TONGA:
+- pcache_info = tonga_cache_info;
++ *pcache_info = tonga_cache_info;
+ num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
+ break;
+ case CHIP_FIJI:
+- pcache_info = fiji_cache_info;
++ *pcache_info = fiji_cache_info;
+ num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
+ break;
+ case CHIP_POLARIS10:
+- pcache_info = polaris10_cache_info;
++ *pcache_info = polaris10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
+ break;
+ case CHIP_POLARIS11:
+- pcache_info = polaris11_cache_info;
++ *pcache_info = polaris11_cache_info;
+ num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
+ break;
+ case CHIP_POLARIS12:
+- pcache_info = polaris12_cache_info;
++ *pcache_info = polaris12_cache_info;
+ num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
+ break;
+ case CHIP_VEGAM:
+- pcache_info = vegam_cache_info;
++ *pcache_info = vegam_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
+ break;
+ default:
+ switch (KFD_GC_VERSION(kdev)) {
+ case IP_VERSION(9, 0, 1):
+- pcache_info = vega10_cache_info;
++ *pcache_info = vega10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
+ break;
+ case IP_VERSION(9, 2, 1):
+- pcache_info = vega12_cache_info;
++ *pcache_info = vega12_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
+ break;
+ case IP_VERSION(9, 4, 0):
+ case IP_VERSION(9, 4, 1):
+- pcache_info = vega20_cache_info;
++ *pcache_info = vega20_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
+ break;
+ case IP_VERSION(9, 4, 2):
+- pcache_info = aldebaran_cache_info;
++ *pcache_info = aldebaran_cache_info;
+ num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
+ break;
+ case IP_VERSION(9, 1, 0):
+ case IP_VERSION(9, 2, 2):
+- pcache_info = raven_cache_info;
++ *pcache_info = raven_cache_info;
+ num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+ break;
+ case IP_VERSION(9, 3, 0):
+- pcache_info = renoir_cache_info;
++ *pcache_info = renoir_cache_info;
+ num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
+ break;
+ case IP_VERSION(10, 1, 10):
+ case IP_VERSION(10, 1, 2):
+ case IP_VERSION(10, 1, 3):
+ case IP_VERSION(10, 1, 4):
+- pcache_info = navi10_cache_info;
++ *pcache_info = navi10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
+ break;
+ case IP_VERSION(10, 1, 1):
+- pcache_info = navi14_cache_info;
++ *pcache_info = navi14_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
+ break;
+ case IP_VERSION(10, 3, 0):
+- pcache_info = sienna_cichlid_cache_info;
++ *pcache_info = sienna_cichlid_cache_info;
+ num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
+ break;
+ case IP_VERSION(10, 3, 2):
+- pcache_info = navy_flounder_cache_info;
++ *pcache_info = navy_flounder_cache_info;
+ num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
+ break;
+ case IP_VERSION(10, 3, 4):
+- pcache_info = dimgrey_cavefish_cache_info;
++ *pcache_info = dimgrey_cavefish_cache_info;
+ num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
+ break;
+ case IP_VERSION(10, 3, 1):
+- pcache_info = vangogh_cache_info;
++ *pcache_info = vangogh_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
+ break;
+ case IP_VERSION(10, 3, 5):
+- pcache_info = beige_goby_cache_info;
++ *pcache_info = beige_goby_cache_info;
+ num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
+ break;
+ case IP_VERSION(10, 3, 3):
+- pcache_info = yellow_carp_cache_info;
++ *pcache_info = yellow_carp_cache_info;
+ num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
+ break;
+ case IP_VERSION(10, 3, 6):
+- pcache_info = gc_10_3_6_cache_info;
++ *pcache_info = gc_10_3_6_cache_info;
+ num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
+ break;
+ case IP_VERSION(10, 3, 7):
+- pcache_info = gfx1037_cache_info;
++ *pcache_info = gfx1037_cache_info;
+ num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
+ break;
+ case IP_VERSION(11, 0, 0):
+ case IP_VERSION(11, 0, 1):
+ case IP_VERSION(11, 0, 2):
+ case IP_VERSION(11, 0, 3):
+- pcache_info = cache_info;
+ num_of_cache_types =
+- kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
++ kfd_fill_gpu_cache_info_from_gfx_config(kdev, *pcache_info);
+ break;
+ default:
+- pcache_info = dummy_cache_info;
++ *pcache_info = dummy_cache_info;
+ num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
+ pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");
+ break;
+ }
+ }
+-
+- *size_filled = 0;
+- *num_of_entries = 0;
+-
+- /* For each type of cache listed in the kfd_gpu_cache_info table,
+- * go through all available Compute Units.
+- * The [i,j,k] loop will
+- * if kfd_gpu_cache_info.num_cu_shared = 1
+- * will parse through all available CU
+- * If (kfd_gpu_cache_info.num_cu_shared != 1)
+- * then it will consider only one CU from
+- * the shared unit
+- */
+-
+- for (ct = 0; ct < num_of_cache_types; ct++) {
+- cu_processor_id = gpu_processor_id;
+- if (pcache_info[ct].cache_level == 1) {
+- for (i = 0; i < cu_info->num_shader_engines; i++) {
+- for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
+- for (k = 0; k < cu_info->num_cu_per_sh;
+- k += pcache_info[ct].num_cu_shared) {
+- ret = fill_in_l1_pcache(pcache,
+- pcache_info,
+- cu_info,
+- mem_available,
+- cu_info->cu_bitmap[i % 4][j + i / 4],
+- ct,
+- cu_processor_id,
+- k);
+-
+- if (ret < 0)
+- break;
+-
+- if (!ret) {
+- pcache++;
+- (*num_of_entries)++;
+- mem_available -= sizeof(*pcache);
+- (*size_filled) += sizeof(*pcache);
+- }
+-
+- /* Move to next CU block */
+- num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
+- cu_info->num_cu_per_sh) ?
+- pcache_info[ct].num_cu_shared :
+- (cu_info->num_cu_per_sh - k);
+- cu_processor_id += num_cu_shared;
+- }
+- }
+- }
+- } else {
+- ret = fill_in_l2_l3_pcache(pcache,
+- pcache_info,
+- cu_info,
+- mem_available,
+- ct,
+- cu_processor_id);
+-
+- if (ret < 0)
+- break;
+-
+- if (!ret) {
+- pcache++;
+- (*num_of_entries)++;
+- mem_available -= sizeof(*pcache);
+- (*size_filled) += sizeof(*pcache);
+- }
+- }
+- }
+-
+- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+-
+- return 0;
++ return num_of_cache_types;
+ }
+
+ static bool kfd_ignore_crat(void)
+@@ -2314,8 +2091,6 @@ static int kfd_create_vcrat_image_gpu(vo
+ struct kfd_cu_info cu_info;
+ int avail_size = *size;
+ uint32_t total_num_of_cu;
+- int num_of_cache_entries = 0;
+- int cache_mem_filled = 0;
+ uint32_t nid = 0;
+ int ret = 0;
+
+@@ -2416,31 +2191,12 @@ static int kfd_create_vcrat_image_gpu(vo
+ crat_table->length += sizeof(struct crat_subtype_memory);
+ crat_table->total_entries++;
+
+- /* TODO: Fill in cache information. This information is NOT readily
+- * available in KGD
+- */
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+- avail_size,
+- &cu_info,
+- (struct crat_subtype_cache *)sub_type_hdr,
+- &cache_mem_filled,
+- &num_of_cache_entries);
+-
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += cache_mem_filled;
+- crat_table->total_entries += num_of_cache_entries;
+- avail_size -= cache_mem_filled;
+-
+ /* Fill in Subtype: IO_LINKS
+ * Only direct links are added here which is Link from GPU to
+ * its NUMA node. Indirect links are added by userspace.
+ */
+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- cache_mem_filled);
++ sub_type_hdr->length);
+ ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
+ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+@@ -317,6 +317,18 @@ struct cdit_header {
+
+ struct kfd_dev;
+
++/* Static table to describe GPU Cache information */
++struct kfd_gpu_cache_info {
++ uint32_t cache_size;
++ uint32_t cache_level;
++ uint32_t flags;
++ /* Indicates how many Compute Units share this cache
++ * within a SA. Value = 1 indicates the cache is not shared
++ */
++ uint32_t num_cu_shared;
++};
++int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info);
++
+ int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
+ void kfd_destroy_crat_image(void *crat_image);
+ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+@@ -364,7 +364,6 @@ static ssize_t kfd_cache_show(struct kob
+
+ /* Making sure that the buffer is an empty string */
+ buffer[0] = 0;
+-
+ cache = container_of(attr, struct kfd_cache_properties, attr);
+ if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
+ return -EPERM;
+@@ -379,12 +378,13 @@ static ssize_t kfd_cache_show(struct kob
+ sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc);
+ sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency);
+ sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type);
++
+ offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map ");
+- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
++ for (i = 0; i < cache->sibling_map_size; i++)
+ for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++)
+ /* Check each bit */
+ offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,",
+- (cache->sibling_map[i] >> j) & 1);
++ (cache->sibling_map[i] >> j) & 1);
+
+ /* Replace the last "," with end of line */
+ buffer[offs-1] = '\n';
+@@ -1198,7 +1198,6 @@ static struct kfd_topology_device *kfd_a
+ struct kfd_iolink_properties *iolink;
+ struct kfd_iolink_properties *p2plink;
+
+- down_write(&topology_lock);
+ list_for_each_entry(dev, &topology_device_list, list) {
+ /* Discrete GPUs need their own topology device list
+ * entries. Don't assign them to CPU/APU nodes.
+@@ -1222,7 +1221,6 @@ static struct kfd_topology_device *kfd_a
+ break;
+ }
+ }
+- up_write(&topology_lock);
+ return out_dev;
+ }
+
+@@ -1593,6 +1591,221 @@ out:
+ return ret;
+ }
+
++
++/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
++static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
++ struct kfd_gpu_cache_info *pcache_info,
++ struct kfd_cu_info *cu_info,
++ int cu_bitmask,
++ int cache_type, unsigned int cu_processor_id,
++ int cu_block)
++{
++ unsigned int cu_sibling_map_mask;
++ int first_active_cu;
++ struct kfd_cache_properties *pcache = NULL;
++
++ cu_sibling_map_mask = cu_bitmask;
++ cu_sibling_map_mask >>= cu_block;
++ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
++ first_active_cu = ffs(cu_sibling_map_mask);
++
++	/* A CU could be inactive. In case of a shared cache, find the
++	 * first active CU; in case of a non-shared cache, check whether
++	 * the CU is inactive and, if so, skip it.
++	 */
++ if (first_active_cu) {
++ pcache = kfd_alloc_struct(pcache);
++ if (!pcache)
++ return -ENOMEM;
++
++ memset(pcache, 0, sizeof(struct kfd_cache_properties));
++ pcache->processor_id_low = cu_processor_id + (first_active_cu - 1);
++ pcache->cache_level = pcache_info[cache_type].cache_level;
++ pcache->cache_size = pcache_info[cache_type].cache_size;
++
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
++
++	/* Sibling map is w.r.t processor_id_low, so shift out
++	 * inactive CUs
++	 */
++ cu_sibling_map_mask =
++ cu_sibling_map_mask >> (first_active_cu - 1);
++
++ pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
++ pcache->sibling_map[1] =
++ (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
++ pcache->sibling_map[2] =
++ (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
++ pcache->sibling_map[3] =
++ (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
++
++ pcache->sibling_map_size = 4;
++ *props_ext = pcache;
++
++ return 0;
++ }
++ return 1;
++}
++
++/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
++static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
++ struct kfd_gpu_cache_info *pcache_info,
++ struct kfd_cu_info *cu_info,
++ int cache_type, unsigned int cu_processor_id)
++{
++ unsigned int cu_sibling_map_mask;
++ int first_active_cu;
++ int i, j, k;
++ struct kfd_cache_properties *pcache = NULL;
++
++ cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
++ cu_sibling_map_mask &=
++ ((1 << pcache_info[cache_type].num_cu_shared) - 1);
++ first_active_cu = ffs(cu_sibling_map_mask);
++
++	/* A CU could be inactive. In case of a shared cache, find the
++	 * first active CU; in case of a non-shared cache, check whether
++	 * the CU is inactive and, if so, skip it.
++	 */
++ if (first_active_cu) {
++ pcache = kfd_alloc_struct(pcache);
++ if (!pcache)
++ return -ENOMEM;
++
++ memset(pcache, 0, sizeof(struct kfd_cache_properties));
++ pcache->processor_id_low = cu_processor_id
++ + (first_active_cu - 1);
++ pcache->cache_level = pcache_info[cache_type].cache_level;
++ pcache->cache_size = pcache_info[cache_type].cache_size;
++
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
++ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
++ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
++
++	/* Sibling map is w.r.t processor_id_low, so shift out
++	 * inactive CUs
++	 */
++ cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
++ k = 0;
++
++ for (i = 0; i < cu_info->num_shader_engines; i++) {
++ for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
++ pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
++ pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
++ pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
++ pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
++ k += 4;
++
++ cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
++ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
++ }
++ }
++ pcache->sibling_map_size = k;
++ *props_ext = pcache;
++ return 0;
++ }
++ return 1;
++}
++
++#define KFD_MAX_CACHE_TYPES 6
++
++/* kfd_fill_cache_non_crat_info - Fill GPU cache info using kfd_gpu_cache_info
++ * tables
++ */
++void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_dev *kdev)
++{
++ struct kfd_gpu_cache_info *pcache_info = NULL;
++ int i, j, k;
++ int ct = 0;
++ unsigned int cu_processor_id;
++ int ret;
++ unsigned int num_cu_shared;
++ struct kfd_cu_info cu_info;
++ struct kfd_cu_info *pcu_info;
++ int gpu_processor_id;
++ struct kfd_cache_properties *props_ext;
++ int num_of_entries = 0;
++ int num_of_cache_types = 0;
++ struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
++
++ amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
++ pcu_info = &cu_info;
++
++ gpu_processor_id = dev->node_props.simd_id_base;
++
++ pcache_info = cache_info;
++ num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info);
++ if (!num_of_cache_types) {
++ pr_warn("no cache info found\n");
++ return;
++ }
++
++	/* For each type of cache listed in the kfd_gpu_cache_info table,
++	 * go through all available Compute Units.
++	 * In the [i,j,k] loop:
++	 *	if kfd_gpu_cache_info.num_cu_shared == 1,
++	 *		parse through all available CUs;
++	 *	if kfd_gpu_cache_info.num_cu_shared != 1,
++	 *		consider only one CU from
++	 *		each shared unit.
++	 */
++ for (ct = 0; ct < num_of_cache_types; ct++) {
++ cu_processor_id = gpu_processor_id;
++ if (pcache_info[ct].cache_level == 1) {
++ for (i = 0; i < pcu_info->num_shader_engines; i++) {
++ for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
++ for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
++
++ ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
++ pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
++ cu_processor_id, k);
++
++ if (ret < 0)
++ break;
++
++ if (!ret) {
++ num_of_entries++;
++ list_add_tail(&props_ext->list, &dev->cache_props);
++ }
++
++ /* Move to next CU block */
++ num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
++ pcu_info->num_cu_per_sh) ?
++ pcache_info[ct].num_cu_shared :
++ (pcu_info->num_cu_per_sh - k);
++ cu_processor_id += num_cu_shared;
++ }
++ }
++ }
++ } else {
++ ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
++ pcu_info, ct, cu_processor_id);
++
++ if (ret < 0)
++ break;
++
++ if (!ret) {
++ num_of_entries++;
++ list_add_tail(&props_ext->list, &dev->cache_props);
++ }
++ }
++ }
++ dev->node_props.caches_count += num_of_entries;
++ pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
++}
++
+ int kfd_topology_add_device(struct kfd_dev *gpu)
+ {
+ uint32_t gpu_id;
+@@ -1617,9 +1830,9 @@ int kfd_topology_add_device(struct kfd_d
+ * CRAT to create a new topology device. Once created assign the gpu to
+ * that topology device
+ */
++ down_write(&topology_lock);
+ dev = kfd_assign_gpu(gpu);
+ if (!dev) {
+- down_write(&topology_lock);
+ proximity_domain = ++topology_crat_proximity_domain;
+
+ res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+@@ -1631,6 +1844,7 @@ int kfd_topology_add_device(struct kfd_d
+ topology_crat_proximity_domain--;
+ return res;
+ }
++
+ res = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list,
+ proximity_domain);
+@@ -1644,23 +1858,28 @@ int kfd_topology_add_device(struct kfd_d
+ kfd_topology_update_device_list(&temp_topology_device_list,
+ &topology_device_list);
+
++ dev = kfd_assign_gpu(gpu);
++ if (WARN_ON(!dev)) {
++ res = -ENODEV;
++ goto err;
++ }
++
++ /* Fill the cache affinity information here for the GPUs
++ * using VCRAT
++ */
++ kfd_fill_cache_non_crat_info(dev, gpu);
++
+ /* Update the SYSFS tree, since we added another topology
+ * device
+ */
+ res = kfd_topology_update_sysfs();
+- up_write(&topology_lock);
+-
+ if (!res)
+ sys_props.generation_count++;
+ else
+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+ gpu_id, res);
+- dev = kfd_assign_gpu(gpu);
+- if (WARN_ON(!dev)) {
+- res = -ENODEV;
+- goto err;
+- }
+ }
++ up_write(&topology_lock);
+
+ dev->gpu_id = gpu_id;
+ gpu->id = gpu_id;
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -80,6 +80,8 @@ struct kfd_mem_properties {
+ struct attribute attr;
+ };
+
++#define CACHE_SIBLINGMAP_SIZE 64
++
+ struct kfd_cache_properties {
+ struct list_head list;
+ uint32_t processor_id_low;
+@@ -90,10 +92,11 @@ struct kfd_cache_properties {
+ uint32_t cache_assoc;
+ uint32_t cache_latency;
+ uint32_t cache_type;
+- uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
++ uint8_t sibling_map[CACHE_SIBLINGMAP_SIZE];
+ struct kfd_dev *gpu;
+ struct kobject *kobj;
+ struct attribute attr;
++ uint32_t sibling_map_size;
+ };
+
+ struct kfd_iolink_properties {