x86,fs/resctrl: Move the resctrl filesystem code to live in /fs/resctrl

author James Morse <james.morse@arm.com>
Thu, 15 May 2025 16:58:54 +0000 (16:58 +0000)
committer Borislav Petkov (AMD) <bp@alien8.de>
Fri, 16 May 2025 12:36:09 +0000 (14:36 +0200)
diff --git a/Documentation/arch/x86/index.rst b/Documentation/arch/x86/index.rst

index 8ac64d7de4dc9a4a8ebaa58e189efea5e876ace5..00f9a99689fb80ea865385d786199efdb82d4571 100644 (file)
--- a/Documentation/arch/x86/index.rst
+++ b/Documentation/arch/x86/index.rst
@@ -31,7 +31,6 @@ x86-specific Documentation
     pti
     mds
     microcode
-   resctrl
     tsx_async_abort
     buslock
     usb-legacy-support
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst

index a9cf8e950b15ad68a021d5f214b07f58d752f4e3..32618512a965078c7a8ab0345ba56767506c5e25 100644 (file)
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -113,6 +113,7 @@ Documentation for filesystem implementations.
     qnx6
     ramfs-rootfs-initramfs
     relay
+   resctrl
     romfs
     smb/index
     spufs/index
diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/filesystems/resctrl.rst

similarity index 100%

rename from Documentation/arch/x86/resctrl.rst

rename to Documentation/filesystems/resctrl.rst
diff --git a/MAINTAINERS b/MAINTAINERS

index ed96cc7ad66258ef541b5252287e5afc33072035..c56ab7d0b62b1d0e4e174dd2637a44455b1cee45 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20424,7 +20424,7 @@ M:      Tony Luck <tony.luck@intel.com>
  M:     Reinette Chatre <reinette.chatre@intel.com>
  L:     linux-kernel@vger.kernel.org
  S:     Supported
-F:     Documentation/arch/x86/resctrl*
+F:     Documentation/filesystems/resctrl.rst
  F:     arch/x86/include/asm/resctrl.h
  F:     arch/x86/kernel/cpu/resctrl/
  F:     fs/resctrl/
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile

index 909be78ec6dac461868ba06361634d9c9341cd2c..d8a04b195da212990289aa95d50fe681a0eea44c 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -5,4 +5,3 @@ obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)    += pseudo_lock.o
  
  # To allow define_trace.h's recursive include:
  CFLAGS_pseudo_lock.o = -I$(src)
-CFLAGS_monitor.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c

index 110b534d400c7e6f3bc48de7cf0f6815b0900ad3..1189c0df4ad763c176d61b1de7b223f1cea3ac91 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -16,277 +16,9 @@
  #define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
  
  #include <linux/cpu.h>
-#include <linux/kernfs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/tick.h>
  
  #include "internal.h"
  
-struct rdt_parse_data {
-       struct rdtgroup         *rdtgrp;
-       char                    *buf;
-};
-
-typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
-                              struct resctrl_schema *s,
-                              struct rdt_ctrl_domain *d);
-
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and max bandwidth values specified by the
- * hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
-       int ret;
-       u32 bw;
-
-       /*
-        * Only linear delay values is supported for current Intel SKUs.
-        */
-       if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
-               rdt_last_cmd_puts("No support for non-linear MB domains\n");
-               return false;
-       }
-
-       ret = kstrtou32(buf, 10, &bw);
-       if (ret) {
-               rdt_last_cmd_printf("Invalid MB value %s\n", buf);
-               return false;
-       }
-
-       /* Nothing else to do if software controller is enabled. */
-       if (is_mba_sc(r)) {
-               *data = bw;
-               return true;
-       }
-
-       if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
-               rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
-                                   bw, r->membw.min_bw, r->membw.max_bw);
-               return false;
-       }
-
-       *data = roundup(bw, (unsigned long)r->membw.bw_gran);
-       return true;
-}
-
-static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
-                   struct rdt_ctrl_domain *d)
-{
-       struct resctrl_staged_config *cfg;
-       u32 closid = data->rdtgrp->closid;
-       struct rdt_resource *r = s->res;
-       u32 bw_val;
-
-       cfg = &d->staged_config[s->conf_type];
-       if (cfg->have_new_ctrl) {
-               rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
-               return -EINVAL;
-       }
-
-       if (!bw_validate(data->buf, &bw_val, r))
-               return -EINVAL;
-
-       if (is_mba_sc(r)) {
-               d->mbps_val[closid] = bw_val;
-               return 0;
-       }
-
-       cfg->new_ctrl = bw_val;
-       cfg->have_new_ctrl = true;
-
-       return 0;
-}
-
-/*
- * Check whether a cache bit mask is valid.
- * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
- *   - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
- *   - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
- *
- * Haswell does not support a non-contiguous 1s value and additionally
- * requires at least two bits set.
- * AMD allows non-contiguous bitmasks.
- */
-static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
-       u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
-       unsigned int cbm_len = r->cache.cbm_len;
-       unsigned long first_bit, zero_bit, val;
-       int ret;
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret) {
-               rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
-               return false;
-       }
-
-       if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
-               rdt_last_cmd_puts("Mask out of range\n");
-               return false;
-       }
-
-       first_bit = find_first_bit(&val, cbm_len);
-       zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
-       /* Are non-contiguous bitmasks allowed? */
-       if (!r->cache.arch_has_sparse_bitmasks &&
-           (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
-               rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
-               return false;
-       }
-
-       if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
-               rdt_last_cmd_printf("Need at least %d bits in the mask\n",
-                                   r->cache.min_cbm_bits);
-               return false;
-       }
-
-       *data = val;
-       return true;
-}
-
-/*
- * Read one cache bit mask (hex). Check that it is valid for the current
- * resource type.
- */
-static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
-                    struct rdt_ctrl_domain *d)
-{
-       struct rdtgroup *rdtgrp = data->rdtgrp;
-       struct resctrl_staged_config *cfg;
-       struct rdt_resource *r = s->res;
-       u32 cbm_val;
-
-       cfg = &d->staged_config[s->conf_type];
-       if (cfg->have_new_ctrl) {
-               rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
-               return -EINVAL;
-       }
-
-       /*
-        * Cannot set up more than one pseudo-locked region in a cache
-        * hierarchy.
-        */
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
-           rdtgroup_pseudo_locked_in_hierarchy(d)) {
-               rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
-               return -EINVAL;
-       }
-
-       if (!cbm_validate(data->buf, &cbm_val, r))
-               return -EINVAL;
-
-       if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
-            rdtgrp->mode == RDT_MODE_SHAREABLE) &&
-           rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
-               rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
-               return -EINVAL;
-       }
-
-       /*
-        * The CBM may not overlap with the CBM of another closid if
-        * either is exclusive.
-        */
-       if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
-               rdt_last_cmd_puts("Overlaps with exclusive group\n");
-               return -EINVAL;
-       }
-
-       if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
-               if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
-                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-                       rdt_last_cmd_puts("Overlaps with other group\n");
-                       return -EINVAL;
-               }
-       }
-
-       cfg->new_ctrl = cbm_val;
-       cfg->have_new_ctrl = true;
-
-       return 0;
-}
-
-/*
- * For each domain in this resource we expect to find a series of:
- *     id=mask
- * separated by ";". The "id" is in decimal, and must match one of
- * the "id"s for this resource.
- */
-static int parse_line(char *line, struct resctrl_schema *s,
-                     struct rdtgroup *rdtgrp)
-{
-       enum resctrl_conf_type t = s->conf_type;
-       ctrlval_parser_t *parse_ctrlval = NULL;
-       struct resctrl_staged_config *cfg;
-       struct rdt_resource *r = s->res;
-       struct rdt_parse_data data;
-       struct rdt_ctrl_domain *d;
-       char *dom = NULL, *id;
-       unsigned long dom_id;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       switch (r->schema_fmt) {
-       case RESCTRL_SCHEMA_BITMAP:
-               parse_ctrlval = &parse_cbm;
-               break;
-       case RESCTRL_SCHEMA_RANGE:
-               parse_ctrlval = &parse_bw;
-               break;
-       }
-
-       if (WARN_ON_ONCE(!parse_ctrlval))
-               return -EINVAL;
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
-           (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
-               rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
-               return -EINVAL;
-       }
-
-next:
-       if (!line || line[0] == '\0')
-               return 0;
-       dom = strsep(&line, ";");
-       id = strsep(&dom, "=");
-       if (!dom || kstrtoul(id, 10, &dom_id)) {
-               rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
-               return -EINVAL;
-       }
-       dom = strim(dom);
-       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-               if (d->hdr.id == dom_id) {
-                       data.buf = dom;
-                       data.rdtgrp = rdtgrp;
-                       if (parse_ctrlval(&data, s, d))
-                               return -EINVAL;
-                       if (rdtgrp->mode ==  RDT_MODE_PSEUDO_LOCKSETUP) {
-                               cfg = &d->staged_config[t];
-                               /*
-                                * In pseudo-locking setup mode and just
-                                * parsed a valid CBM that should be
-                                * pseudo-locked. Only one locked region per
-                                * resource group and domain so just do
-                                * the required initialization for single
-                                * region and return.
-                                */
-                               rdtgrp->plr->s = s;
-                               rdtgrp->plr->d = d;
-                               rdtgrp->plr->cbm = cfg->new_ctrl;
-                               d->plr = rdtgrp->plr;
-                               return 0;
-                       }
-                       goto next;
-               }
-       }
-       return -EINVAL;
-}
-
  int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
                             u32 closid, enum resctrl_conf_type t, u32 cfg_val)
  {
@@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
         return 0;
  }
  
-static int rdtgroup_parse_resource(char *resname, char *tok,
-                                  struct rdtgroup *rdtgrp)
-{
-       struct resctrl_schema *s;
-
-       list_for_each_entry(s, &resctrl_schema_all, list) {
-               if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
-                       return parse_line(tok, s, rdtgrp);
-       }
-       rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
-       return -EINVAL;
-}
-
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
-                               char *buf, size_t nbytes, loff_t off)
-{
-       struct resctrl_schema *s;
-       struct rdtgroup *rdtgrp;
-       struct rdt_resource *r;
-       char *tok, *resname;
-       int ret = 0;
-
-       /* Valid input requires a trailing newline */
-       if (nbytes == 0 || buf[nbytes - 1] != '\n')
-               return -EINVAL;
-       buf[nbytes - 1] = '\0';
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-       rdt_last_cmd_clear();
-
-       /*
-        * No changes to pseudo-locked region allowed. It has to be removed
-        * and re-created instead.
-        */
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
-               ret = -EINVAL;
-               rdt_last_cmd_puts("Resource group is pseudo-locked\n");
-               goto out;
-       }
-
-       rdt_staged_configs_clear();
-
-       while ((tok = strsep(&buf, "\n")) != NULL) {
-               resname = strim(strsep(&tok, ":"));
-               if (!tok) {
-                       rdt_last_cmd_puts("Missing ':'\n");
-                       ret = -EINVAL;
-                       goto out;
-               }
-               if (tok[0] == '\0') {
-                       rdt_last_cmd_printf("Missing '%s' value\n", resname);
-                       ret = -EINVAL;
-                       goto out;
-               }
-               ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
-               if (ret)
-                       goto out;
-       }
-
-       list_for_each_entry(s, &resctrl_schema_all, list) {
-               r = s->res;
-
-               /*
-                * Writes to mba_sc resources update the software controller,
-                * not the control MSR.
-                */
-               if (is_mba_sc(r))
-                       continue;
-
-               ret = resctrl_arch_update_domains(r, rdtgrp->closid);
-               if (ret)
-                       goto out;
-       }
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-               /*
-                * If pseudo-locking fails we keep the resource group in
-                * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
-                * active and updated for just the domain the pseudo-locked
-                * region was requested for.
-                */
-               ret = rdtgroup_pseudo_lock_create(rdtgrp);
-       }
-
-out:
-       rdt_staged_configs_clear();
-       rdtgroup_kn_unlock(of->kn);
-       return ret ?: nbytes;
-}
-
  u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
                             u32 closid, enum resctrl_conf_type type)
  {
@@ -453,282 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
  
         return hw_dom->ctrl_val[idx];
  }
-
-static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
-{
-       struct rdt_resource *r = schema->res;
-       struct rdt_ctrl_domain *dom;
-       bool sep = false;
-       u32 ctrl_val;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       seq_printf(s, "%*s:", max_name_width, schema->name);
-       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
-               if (sep)
-                       seq_puts(s, ";");
-
-               if (is_mba_sc(r))
-                       ctrl_val = dom->mbps_val[closid];
-               else
-                       ctrl_val = resctrl_arch_get_config(r, dom, closid,
-                                                          schema->conf_type);
-
-               seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
-               sep = true;
-       }
-       seq_puts(s, "\n");
-}
-
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
-                          struct seq_file *s, void *v)
-{
-       struct resctrl_schema *schema;
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-       u32 closid;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (rdtgrp) {
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-                       list_for_each_entry(schema, &resctrl_schema_all, list) {
-                               seq_printf(s, "%s:uninitialized\n", schema->name);
-                       }
-               } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
-                       if (!rdtgrp->plr->d) {
-                               rdt_last_cmd_clear();
-                               rdt_last_cmd_puts("Cache domain offline\n");
-                               ret = -ENODEV;
-                       } else {
-                               seq_printf(s, "%s:%d=%x\n",
-                                          rdtgrp->plr->s->res->name,
-                                          rdtgrp->plr->d->hdr.id,
-                                          rdtgrp->plr->cbm);
-                       }
-               } else {
-                       closid = rdtgrp->closid;
-                       list_for_each_entry(schema, &resctrl_schema_all, list) {
-                               if (closid < schema->num_closid)
-                                       show_doms(s, schema, closid);
-                       }
-               }
-       } else {
-               ret = -ENOENT;
-       }
-       rdtgroup_kn_unlock(of->kn);
-       return ret;
-}
-
-static int smp_mon_event_count(void *arg)
-{
-       mon_event_count(arg);
-
-       return 0;
-}
-
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
-                                     char *buf, size_t nbytes, loff_t off)
-{
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-
-       /* Valid input requires a trailing newline */
-       if (nbytes == 0 || buf[nbytes - 1] != '\n')
-               return -EINVAL;
-       buf[nbytes - 1] = '\0';
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-       rdt_last_cmd_clear();
-
-       if (!strcmp(buf, "mbm_local_bytes")) {
-               if (resctrl_arch_is_mbm_local_enabled())
-                       rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
-               else
-                       ret = -EINVAL;
-       } else if (!strcmp(buf, "mbm_total_bytes")) {
-               if (resctrl_arch_is_mbm_total_enabled())
-                       rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-               else
-                       ret = -EINVAL;
-       } else {
-               ret = -EINVAL;
-       }
-
-       if (ret)
-               rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
-
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret ?: nbytes;
-}
-
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
-                                struct seq_file *s, void *v)
-{
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
-       if (rdtgrp) {
-               switch (rdtgrp->mba_mbps_event) {
-               case QOS_L3_MBM_LOCAL_EVENT_ID:
-                       seq_puts(s, "mbm_local_bytes\n");
-                       break;
-               case QOS_L3_MBM_TOTAL_EVENT_ID:
-                       seq_puts(s, "mbm_total_bytes\n");
-                       break;
-               default:
-                       pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
-                       ret = -EINVAL;
-                       break;
-               }
-       } else {
-               ret = -ENOENT;
-       }
-
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret;
-}
-
-struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
-                                          struct list_head **pos)
-{
-       struct rdt_domain_hdr *d;
-       struct list_head *l;
-
-       list_for_each(l, h) {
-               d = list_entry(l, struct rdt_domain_hdr, list);
-               /* When id is found, return its domain. */
-               if (id == d->id)
-                       return d;
-               /* Stop searching when finding id's position in sorted list. */
-               if (id < d->id)
-                       break;
-       }
-
-       if (pos)
-               *pos = l;
-
-       return NULL;
-}
-
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
-                   struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
-                   cpumask_t *cpumask, int evtid, int first)
-{
-       int cpu;
-
-       /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       /*
-        * Setup the parameters to pass to mon_event_count() to read the data.
-        */
-       rr->rgrp = rdtgrp;
-       rr->evtid = evtid;
-       rr->r = r;
-       rr->d = d;
-       rr->first = first;
-       rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
-       if (IS_ERR(rr->arch_mon_ctx)) {
-               rr->err = -EINVAL;
-               return;
-       }
-
-       cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
-
-       /*
-        * cpumask_any_housekeeping() prefers housekeeping CPUs, but
-        * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
-        * MPAM's resctrl_arch_rmid_read() is unable to read the
-        * counters on some platforms if its called in IRQ context.
-        */
-       if (tick_nohz_full_cpu(cpu))
-               smp_call_function_any(cpumask, mon_event_count, rr, 1);
-       else
-               smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
-
-       resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
-}
-
-int rdtgroup_mondata_show(struct seq_file *m, void *arg)
-{
-       struct kernfs_open_file *of = m->private;
-       enum resctrl_res_level resid;
-       enum resctrl_event_id evtid;
-       struct rdt_domain_hdr *hdr;
-       struct rmid_read rr = {0};
-       struct rdt_mon_domain *d;
-       struct rdtgroup *rdtgrp;
-       struct rdt_resource *r;
-       struct mon_data *md;
-       int domid, ret = 0;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               ret = -ENOENT;
-               goto out;
-       }
-
-       md = of->kn->priv;
-       if (WARN_ON_ONCE(!md)) {
-               ret = -EIO;
-               goto out;
-       }
-
-       resid = md->rid;
-       domid = md->domid;
-       evtid = md->evtid;
-       r = resctrl_arch_get_resource(resid);
-
-       if (md->sum) {
-               /*
-                * This file requires summing across all domains that share
-                * the L3 cache id that was provided in the "domid" field of the
-                * struct mon_data. Search all domains in the resource for
-                * one that matches this cache id.
-                */
-               list_for_each_entry(d, &r->mon_domains, hdr.list) {
-                       if (d->ci->id == domid) {
-                               rr.ci = d->ci;
-                               mon_event_read(&rr, r, NULL, rdtgrp,
-                                              &d->ci->shared_cpu_map, evtid, false);
-                               goto checkresult;
-                       }
-               }
-               ret = -ENOENT;
-               goto out;
-       } else {
-               /*
-                * This file provides data from a single domain. Search
-                * the resource to find the domain with "domid".
-                */
-               hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
-               if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
-                       ret = -ENOENT;
-                       goto out;
-               }
-               d = container_of(hdr, struct rdt_mon_domain, hdr);
-               mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
-       }
-
-checkresult:
-
-       if (rr.err == -EIO)
-               seq_puts(m, "Error\n");
-       else if (rr.err == -EINVAL)
-               seq_puts(m, "Unavailable\n");
-       else
-               seq_printf(m, "%llu\n", rr.val);
-
-out:
-       rdtgroup_kn_unlock(of->kn);
-       return ret;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h

index dc63ac538a81693dfa9fc0141cec95aeff28973e..5e3c41b3643737e25110d1e91a3ab17f13cc2f06 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -3,24 +3,21 @@
  #define _ASM_X86_RESCTRL_INTERNAL_H
  
  #include <linux/resctrl.h>
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/fs_context.h>
-#include <linux/jump_label.h>
-#include <linux/tick.h>
  
  #define L3_QOS_CDP_ENABLE              0x01ULL
  
  #define L2_QOS_CDP_ENABLE              0x01ULL
  
-#define CQM_LIMBOCHECK_INTERVAL        1000
-
  #define MBM_CNTR_WIDTH_BASE            24
+
  #define MBA_IS_LINEAR                  0x4
+
  #define MBM_CNTR_WIDTH_OFFSET_AMD      20
  
  #define RMID_VAL_ERROR                 BIT_ULL(63)
+
  #define RMID_VAL_UNAVAIL               BIT_ULL(62)
+
  /*
   * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
   * data to be returned. The counter width is discovered from the hardware
@@ -28,263 +25,6 @@
   */
  #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
  
-/**
- * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
- *                             aren't marked nohz_full
- * @mask:      The mask to pick a CPU from.
- * @exclude_cpu:The CPU to avoid picking.
- *
- * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
- * CPUs that don't use nohz_full, these are preferred. Pass
- * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
- *
- * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
- */
-static inline unsigned int
-cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
-{
-       unsigned int cpu;
-
-       /* Try to find a CPU that isn't nohz_full to use in preference */
-       if (tick_nohz_full_enabled()) {
-               cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu);
-               if (cpu < nr_cpu_ids)
-                       return cpu;
-       }
-
-       return cpumask_any_but(mask, exclude_cpu);
-}
-
-struct rdt_fs_context {
-       struct kernfs_fs_context        kfc;
-       bool                            enable_cdpl2;
-       bool                            enable_cdpl3;
-       bool                            enable_mba_mbps;
-       bool                            enable_debug;
-};
-
-static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
-{
-       struct kernfs_fs_context *kfc = fc->fs_private;
-
-       return container_of(kfc, struct rdt_fs_context, kfc);
-}
-
-/**
- * struct mon_evt - Entry in the event list of a resource
- * @evtid:             event id
- * @name:              name of the event
- * @configurable:      true if the event is configurable
- * @list:              entry in &rdt_resource->evt_list
- */
-struct mon_evt {
-       enum resctrl_event_id   evtid;
-       char                    *name;
-       bool                    configurable;
-       struct list_head        list;
-};
-
-/**
- * struct mon_data - Monitoring details for each event file.
- * @list:            Member of the global @mon_data_kn_priv_list list.
- * @rid:             Resource id associated with the event file.
- * @evtid:           Event id associated with the event file.
- * @sum:             Set when event must be summed across multiple
- *                   domains.
- * @domid:           When @sum is zero this is the domain to which
- *                   the event file belongs. When @sum is one this
- *                   is the id of the L3 cache that all domains to be
- *                   summed share.
- *
- * Pointed to by the kernfs kn->priv field of monitoring event files.
- * Readers and writers must hold rdtgroup_mutex.
- */
-struct mon_data {
-       struct list_head        list;
-       enum resctrl_res_level  rid;
-       enum resctrl_event_id   evtid;
-       int                     domid;
-       bool                    sum;
-};
-
-/**
- * struct rmid_read - Data passed across smp_call*() to read event count.
- * @rgrp:  Resource group for which the counter is being read. If it is a parent
- *        resource group then its event count is summed with the count from all
- *        its child resource groups.
- * @r:    Resource describing the properties of the event being read.
- * @d:    Domain that the counter should be read from. If NULL then sum all
- *        domains in @r sharing L3 @ci.id
- * @evtid: Which monitor event to read.
- * @first: Initialize MBM counter when true.
- * @ci:    Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
- * @err:   Error encountered when reading counter.
- * @val:   Returned value of event counter. If @rgrp is a parent resource group,
- *        @val includes the sum of event counts from its child resource groups.
- *        If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
- *        (summed across child resource groups if @rgrp is a parent resource group).
- * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
- */
-struct rmid_read {
-       struct rdtgroup         *rgrp;
-       struct rdt_resource     *r;
-       struct rdt_mon_domain   *d;
-       enum resctrl_event_id   evtid;
-       bool                    first;
-       struct cacheinfo        *ci;
-       int                     err;
-       u64                     val;
-       void                    *arch_mon_ctx;
-};
-
-extern struct list_head resctrl_schema_all;
-extern bool resctrl_mounted;
-
-enum rdt_group_type {
-       RDTCTRL_GROUP = 0,
-       RDTMON_GROUP,
-       RDT_NUM_GROUP,
-};
-
-/**
- * enum rdtgrp_mode - Mode of a RDT resource group
- * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
- * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
- * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
- * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
- *                          allowed AND the allocations are Cache Pseudo-Locked
- * @RDT_NUM_MODES: Total number of modes
- *
- * The mode of a resource group enables control over the allowed overlap
- * between allocations associated with different resource groups (classes
- * of service). User is able to modify the mode of a resource group by
- * writing to the "mode" resctrl file associated with the resource group.
- *
- * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
- * writing the appropriate text to the "mode" file. A resource group enters
- * "pseudo-locked" mode after the schemata is written while the resource
- * group is in "pseudo-locksetup" mode.
- */
-enum rdtgrp_mode {
-       RDT_MODE_SHAREABLE = 0,
-       RDT_MODE_EXCLUSIVE,
-       RDT_MODE_PSEUDO_LOCKSETUP,
-       RDT_MODE_PSEUDO_LOCKED,
-
-       /* Must be last */
-       RDT_NUM_MODES,
-};
-
-/**
- * struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn:               kernfs node for the mon_data directory
- * @parent:                    parent rdtgrp
- * @crdtgrp_list:              child rdtgroup node list
- * @rmid:                      rmid for this rdtgroup
- */
-struct mongroup {
-       struct kernfs_node      *mon_data_kn;
-       struct rdtgroup         *parent;
-       struct list_head        crdtgrp_list;
-       u32                     rmid;
-};
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn:                                kernfs node
- * @rdtgroup_list:             linked list for all rdtgroups
- * @closid:                    closid for this rdtgroup
- * @cpu_mask:                  CPUs assigned to this rdtgroup
- * @flags:                     status bits
- * @waitcount:                 how many cpus expect to find this
- *                             group when they acquire rdtgroup_mutex
- * @type:                      indicates type of this rdtgroup - either
- *                             monitor only or ctrl_mon group
- * @mon:                       mongroup related data
- * @mode:                      mode of resource group
- * @mba_mbps_event:            input monitoring event id when mba_sc is enabled
- * @plr:                       pseudo-locked region
- */
-struct rdtgroup {
-       struct kernfs_node              *kn;
-       struct list_head                rdtgroup_list;
-       u32                             closid;
-       struct cpumask                  cpu_mask;
-       int                             flags;
-       atomic_t                        waitcount;
-       enum rdt_group_type             type;
-       struct mongroup                 mon;
-       enum rdtgrp_mode                mode;
-       enum resctrl_event_id           mba_mbps_event;
-       struct pseudo_lock_region       *plr;
-};
-
-/* rdtgroup.flags */
-#define        RDT_DELETED             1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/*
- * Define the file type flags for base and info directories.
- */
-#define RFTYPE_INFO                    BIT(0)
-#define RFTYPE_BASE                    BIT(1)
-#define RFTYPE_CTRL                    BIT(4)
-#define RFTYPE_MON                     BIT(5)
-#define RFTYPE_TOP                     BIT(6)
-#define RFTYPE_RES_CACHE               BIT(8)
-#define RFTYPE_RES_MB                  BIT(9)
-#define RFTYPE_DEBUG                   BIT(10)
-#define RFTYPE_CTRL_INFO               (RFTYPE_INFO | RFTYPE_CTRL)
-#define RFTYPE_MON_INFO                        (RFTYPE_INFO | RFTYPE_MON)
-#define RFTYPE_TOP_INFO                        (RFTYPE_INFO | RFTYPE_TOP)
-#define RFTYPE_CTRL_BASE               (RFTYPE_BASE | RFTYPE_CTRL)
-#define RFTYPE_MON_BASE                        (RFTYPE_BASE | RFTYPE_MON)
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width;
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name:      File name
- * @mode:      Access mode
- * @kf_ops:    File operations
- * @flags:     File specific RFTYPE_FLAGS_* flags
- * @fflags:    File specific RFTYPE_* flags
- * @seq_show:  Show content of the file
- * @write:     Write to the file
- */
-struct rftype {
-       char                    *name;
-       umode_t                 mode;
-       const struct kernfs_ops *kf_ops;
-       unsigned long           flags;
-       unsigned long           fflags;
-
-       int (*seq_show)(struct kernfs_open_file *of,
-                       struct seq_file *sf, void *v);
-       /*
-        * write() is the generic write callback which maps directly to
-        * kernfs write operation and overrides all other operations.
-        * Maximum write size is determined by ->max_write_len.
-        */
-       ssize_t (*write)(struct kernfs_open_file *of,
-                        char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct mbm_state - status for each MBM counter in each domain
- * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
- * @prev_bw:   The most recent bandwidth in MBps
- */
-struct mbm_state {
-       u64     prev_bw_bytes;
-       u32     prev_bw;
-};
-
  /**
   * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
   *                        return value.
@@ -382,17 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
         return container_of(r, struct rdt_hw_resource, r_resctrl);
  }
  
-extern struct mutex rdtgroup_mutex;
-
-static inline const char *rdt_kn_name(const struct kernfs_node *kn)
-{
-       return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
-}
-
  extern struct rdt_hw_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-extern struct dentry *debugfs_resctrl;
-extern enum resctrl_event_id mba_mbps_default_event;
  
  void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
  
@@ -429,99 +159,14 @@ union cpuid_0x10_x_edx {
         unsigned int full;
  };
  
-void rdt_last_cmd_clear(void);
-void rdt_last_cmd_puts(const char *s);
-__printf(1, 2)
-void rdt_last_cmd_printf(const char *fmt, ...);
-
  void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
-                            umode_t mask);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
-                               char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
-                          struct seq_file *s, void *v);
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
-                                     char *buf, size_t nbytes, loff_t off);
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
-                                struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
-                          unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
-                                 unsigned long cbm);
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
-int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int closids_supported(void);
-void closid_free(int closid);
-int alloc_rmid(u32 closid);
-void free_rmid(u32 closid, u32 rmid);
-int rdt_get_mon_l3_config(struct rdt_resource *r);
-void resctrl_mon_resource_exit(void);
-bool rdt_cpu_has(int flag);
-void mon_event_count(void *info);
-int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
-                   struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
-                   cpumask_t *cpumask, int evtid, int first);
-int resctrl_mon_resource_init(void);
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
-                               unsigned long delay_ms,
-                               int exclude_cpu);
-void mbm_handle_overflow(struct work_struct *work);
-void __init intel_rdt_mbm_apply_quirk(void);
-bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
-                            int exclude_cpu);
-void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_mon_domain *d);
-void __check_limbo(struct rdt_mon_domain *d, bool force_free);
-void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void resctrl_file_fflags_init(const char *config, unsigned long fflags);
-void rdt_staged_configs_clear(void);
-bool closid_allocated(unsigned int closid);
-int resctrl_find_cleanest_closid(void);
-
-#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-#else
-static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
-       return -EOPNOTSUPP;
-}
  
-static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
-       return -EOPNOTSUPP;
-}
-
-static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
-{
-       return false;
-}
+int rdt_get_mon_l3_config(struct rdt_resource *r);
  
-static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
-{
-       return false;
-}
+bool rdt_cpu_has(int flag);
  
-static inline int rdt_pseudo_lock_init(void) { return 0; }
-static inline void rdt_pseudo_lock_release(void) { }
-static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
-       return -EOPNOTSUPP;
-}
+void __init intel_rdt_mbm_apply_quirk(void);
  
-static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
-#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
  
  #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c

index 8847c23e9ac164950c6f030e590af4b2c671cda4..3fc4d9f56f0d60b362c51cfd2464f7019d0574ad 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -18,65 +18,12 @@
  #define pr_fmt(fmt)    "resctrl: " fmt
  
  #include <linux/cpu.h>
-#include <linux/module.h>
  #include <linux/resctrl.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
  
  #include <asm/cpu_device_id.h>
  
  #include "internal.h"
  
-#define CREATE_TRACE_POINTS
-#include "monitor_trace.h"
-
-/**
- * struct rmid_entry - dirty tracking for all RMID.
- * @closid:    The CLOSID for this entry.
- * @rmid:      The RMID for this entry.
- * @busy:      The number of domains with cached data using this RMID.
- * @list:      Member of the rmid_free_lru list when busy == 0.
- *
- * Depending on the architecture the correct monitor is accessed using
- * both @closid and @rmid, or @rmid only.
- *
- * Take the rdtgroup_mutex when accessing.
- */
-struct rmid_entry {
-       u32                             closid;
-       u32                             rmid;
-       int                             busy;
-       struct list_head                list;
-};
-
-/*
- * @rmid_free_lru - A least recently used list of free RMIDs
- *     These RMIDs are guaranteed to have an occupancy less than the
- *     threshold occupancy
- */
-static LIST_HEAD(rmid_free_lru);
-
-/*
- * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
- *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
- *     Indexed by CLOSID. Protected by rdtgroup_mutex.
- */
-static u32 *closid_num_dirty_rmid;
-
-/*
- * @rmid_limbo_count - count of currently unused but (potentially)
- *     dirty RMIDs.
- *     This counts RMIDs that no one is currently using but that
- *     may have a occupancy value > resctrl_rmid_realloc_threshold. User can
- *     change the threshold occupancy value.
- */
-static unsigned int rmid_limbo_count;
-
-/*
- * @rmid_entry - The entry in the limbo and free lists.
- */
-static struct rmid_entry       *rmid_ptrs;
-
  /*
   * Global boolean for rdt_monitor which is true if any
   * resource monitoring is enabled.
@@ -88,23 +35,12 @@ bool rdt_mon_capable;
   */
  unsigned int rdt_mon_features;
  
-/*
- * This is the threshold cache occupancy in bytes at which we will consider an
- * RMID available for re-allocation.
- */
-unsigned int resctrl_rmid_realloc_threshold;
-
-/*
- * This is the maximum value for the reallocation threshold, in bytes.
- */
-unsigned int resctrl_rmid_realloc_limit;
-
  #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
  
  static int snc_nodes_per_l3_cache = 1;
  
  /*
- * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
   * If rmid > rmid threshold, MBM total and local values should be multiplied
   * by the correction factor.
   *
@@ -153,6 +89,7 @@ static const struct mbm_correction_factor_table {
  };
  
  static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
  static u64 mbm_cf __read_mostly;
  
  static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
@@ -164,33 +101,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
         return val;
  }
  
-/*
- * x86 and arm64 differ in their handling of monitoring.
- * x86's RMID are independent numbers, there is only one source of traffic
- * with an RMID value of '1'.
- * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
- * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
- * value is no longer unique.
- * To account for this, resctrl uses an index. On x86 this is just the RMID,
- * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
- *
- * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
- * must accept an attempt to read every index.
- */
-static inline struct rmid_entry *__rmid_entry(u32 idx)
-{
-       struct rmid_entry *entry;
-       u32 closid, rmid;
-
-       entry = &rmid_ptrs[idx];
-       resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
-
-       WARN_ON_ONCE(entry->closid != closid);
-       WARN_ON_ONCE(entry->rmid != rmid);
-
-       return entry;
-}
-
  /*
   * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
   * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
@@ -347,769 +257,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
         return 0;
  }
  
-static void limbo_release_entry(struct rmid_entry *entry)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       rmid_limbo_count--;
-       list_add_tail(&entry->list, &rmid_free_lru);
-
-       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
-               closid_num_dirty_rmid[entry->closid]--;
-}
-
-/*
- * Check the RMIDs that are marked as busy for this domain. If the
- * reported LLC occupancy is below the threshold clear the busy bit and
- * decrement the count. If the busy count gets to zero on an RMID, we
- * free the RMID
- */
-void __check_limbo(struct rdt_mon_domain *d, bool force_free)
-{
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-       struct rmid_entry *entry;
-       u32 idx, cur_idx = 1;
-       void *arch_mon_ctx;
-       bool rmid_dirty;
-       u64 val = 0;
-
-       arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
-       if (IS_ERR(arch_mon_ctx)) {
-               pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-                                   PTR_ERR(arch_mon_ctx));
-               return;
-       }
-
-       /*
-        * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
-        * are marked as busy for occupancy < threshold. If the occupancy
-        * is less than the threshold decrement the busy counter of the
-        * RMID and move it to the free list when the counter reaches 0.
-        */
-       for (;;) {
-               idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
-               if (idx >= idx_limit)
-                       break;
-
-               entry = __rmid_entry(idx);
-               if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
-                                          QOS_L3_OCCUP_EVENT_ID, &val,
-                                          arch_mon_ctx)) {
-                       rmid_dirty = true;
-               } else {
-                       rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
-
-                       /*
-                        * x86's CLOSID and RMID are independent numbers, so the entry's
-                        * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
-                        * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
-                        * used to select the configuration. It is thus necessary to track both
-                        * CLOSID and RMID because there may be dependencies between them
-                        * on some architectures.
-                        */
-                       trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
-               }
-
-               if (force_free || !rmid_dirty) {
-                       clear_bit(idx, d->rmid_busy_llc);
-                       if (!--entry->busy)
-                               limbo_release_entry(entry);
-               }
-               cur_idx = idx + 1;
-       }
-
-       resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
-}
-
-bool has_busy_rmid(struct rdt_mon_domain *d)
-{
-       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-
-       return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
-}
-
-static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
-{
-       struct rmid_entry *itr;
-       u32 itr_idx, cmp_idx;
-
-       if (list_empty(&rmid_free_lru))
-               return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
-
-       list_for_each_entry(itr, &rmid_free_lru, list) {
-               /*
-                * Get the index of this free RMID, and the index it would need
-                * to be if it were used with this CLOSID.
-                * If the CLOSID is irrelevant on this architecture, the two
-                * index values are always the same on every entry and thus the
-                * very first entry will be returned.
-                */
-               itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
-               cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
-
-               if (itr_idx == cmp_idx)
-                       return itr;
-       }
-
-       return ERR_PTR(-ENOSPC);
-}
-
-/**
- * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
- *                                  RMID are clean, or the CLOSID that has
- *                                  the most clean RMID.
- *
- * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
- * may not be able to allocate clean RMID. To avoid this the allocator will
- * choose the CLOSID with the most clean RMID.
- *
- * When the CLOSID and RMID are independent numbers, the first free CLOSID will
- * be returned.
- */
-int resctrl_find_cleanest_closid(void)
-{
-       u32 cleanest_closid = ~0;
-       int i = 0;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
-               return -EIO;
-
-       for (i = 0; i < closids_supported(); i++) {
-               int num_dirty;
-
-               if (closid_allocated(i))
-                       continue;
-
-               num_dirty = closid_num_dirty_rmid[i];
-               if (num_dirty == 0)
-                       return i;
-
-               if (cleanest_closid == ~0)
-                       cleanest_closid = i;
-
-               if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
-                       cleanest_closid = i;
-       }
-
-       if (cleanest_closid == ~0)
-               return -ENOSPC;
-
-       return cleanest_closid;
-}
-
-/*
- * For MPAM the RMID value is not unique, and has to be considered with
- * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
- * allows all domains to be managed by a single free list.
- * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
- */
-int alloc_rmid(u32 closid)
-{
-       struct rmid_entry *entry;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       entry = resctrl_find_free_rmid(closid);
-       if (IS_ERR(entry))
-               return PTR_ERR(entry);
-
-       list_del(&entry->list);
-       return entry->rmid;
-}
-
-static void add_rmid_to_limbo(struct rmid_entry *entry)
-{
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       struct rdt_mon_domain *d;
-       u32 idx;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
-
-       entry->busy = 0;
-       list_for_each_entry(d, &r->mon_domains, hdr.list) {
-               /*
-                * For the first limbo RMID in the domain,
-                * setup up the limbo worker.
-                */
-               if (!has_busy_rmid(d))
-                       cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
-                                               RESCTRL_PICK_ANY_CPU);
-               set_bit(idx, d->rmid_busy_llc);
-               entry->busy++;
-       }
-
-       rmid_limbo_count++;
-       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
-               closid_num_dirty_rmid[entry->closid]++;
-}
-
-void free_rmid(u32 closid, u32 rmid)
-{
-       u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-       struct rmid_entry *entry;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       /*
-        * Do not allow the default rmid to be free'd. Comparing by index
-        * allows architectures that ignore the closid parameter to avoid an
-        * unnecessary check.
-        */
-       if (!resctrl_arch_mon_capable() ||
-           idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
-                                               RESCTRL_RESERVED_RMID))
-               return;
-
-       entry = __rmid_entry(idx);
-
-       if (resctrl_arch_is_llc_occupancy_enabled())
-               add_rmid_to_limbo(entry);
-       else
-               list_add_tail(&entry->list, &rmid_free_lru);
-}
-
-static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
-                                      u32 rmid, enum resctrl_event_id evtid)
-{
-       u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-
-       switch (evtid) {
-       case QOS_L3_MBM_TOTAL_EVENT_ID:
-               return &d->mbm_total[idx];
-       case QOS_L3_MBM_LOCAL_EVENT_ID:
-               return &d->mbm_local[idx];
-       default:
-               return NULL;
-       }
-}
-
-static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
-       int cpu = smp_processor_id();
-       struct rdt_mon_domain *d;
-       struct mbm_state *m;
-       int err, ret;
-       u64 tval = 0;
-
-       if (rr->first) {
-               resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
-               m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
-               if (m)
-                       memset(m, 0, sizeof(struct mbm_state));
-               return 0;
-       }
-
-       if (rr->d) {
-               /* Reading a single domain, must be on a CPU in that domain. */
-               if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
-                       return -EINVAL;
-               rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
-                                                rr->evtid, &tval, rr->arch_mon_ctx);
-               if (rr->err)
-                       return rr->err;
-
-               rr->val += tval;
-
-               return 0;
-       }
-
-       /* Summing domains that share a cache, must be on a CPU for that cache. */
-       if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
-               return -EINVAL;
-
-       /*
-        * Legacy files must report the sum of an event across all
-        * domains that share the same L3 cache instance.
-        * Report success if a read from any domain succeeds, -EINVAL
-        * (translated to "Unavailable" for user space) if reading from
-        * all domains fail for any reason.
-        */
-       ret = -EINVAL;
-       list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
-               if (d->ci->id != rr->ci->id)
-                       continue;
-               err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
-                                            rr->evtid, &tval, rr->arch_mon_ctx);
-               if (!err) {
-                       rr->val += tval;
-                       ret = 0;
-               }
-       }
-
-       if (ret)
-               rr->err = ret;
-
-       return ret;
-}
-
-/*
- * mbm_bw_count() - Update bw count from values previously read by
- *                 __mon_event_count().
- * @closid:    The closid used to identify the cached mbm_state.
- * @rmid:      The rmid used to identify the cached mbm_state.
- * @rr:                The struct rmid_read populated by __mon_event_count().
- *
- * Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps. The chunks value previously read by
- * __mon_event_count() is compared with the chunks value from the previous
- * invocation. This must be called once per second to maintain values in MBps.
- */
-static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
-       u64 cur_bw, bytes, cur_bytes;
-       struct mbm_state *m;
-
-       m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
-       if (WARN_ON_ONCE(!m))
-               return;
-
-       cur_bytes = rr->val;
-       bytes = cur_bytes - m->prev_bw_bytes;
-       m->prev_bw_bytes = cur_bytes;
-
-       cur_bw = bytes / SZ_1M;
-
-       m->prev_bw = cur_bw;
-}
-
-/*
- * This is scheduled by mon_event_read() to read the CQM/MBM counters
- * on a domain.
- */
-void mon_event_count(void *info)
-{
-       struct rdtgroup *rdtgrp, *entry;
-       struct rmid_read *rr = info;
-       struct list_head *head;
-       int ret;
-
-       rdtgrp = rr->rgrp;
-
-       ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
-
-       /*
-        * For Ctrl groups read data from child monitor groups and
-        * add them together. Count events which are read successfully.
-        * Discard the rmid_read's reporting errors.
-        */
-       head = &rdtgrp->mon.crdtgrp_list;
-
-       if (rdtgrp->type == RDTCTRL_GROUP) {
-               list_for_each_entry(entry, head, mon.crdtgrp_list) {
-                       if (__mon_event_count(entry->closid, entry->mon.rmid,
-                                             rr) == 0)
-                               ret = 0;
-               }
-       }
-
-       /*
-        * __mon_event_count() calls for newly created monitor groups may
-        * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
-        * Discard error if any of the monitor event reads succeeded.
-        */
-       if (ret == 0)
-               rr->err = 0;
-}
-
-static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
-                                                       struct rdt_resource *r)
-{
-       struct rdt_ctrl_domain *d;
-
-       lockdep_assert_cpus_held();
-
-       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-               /* Find the domain that contains this CPU */
-               if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
-                       return d;
-       }
-
-       return NULL;
-}
-
-/*
- * Feedback loop for MBA software controller (mba_sc)
- *
- * mba_sc is a feedback loop where we periodically read MBM counters and
- * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
- * that:
- *
- *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
- *
- * This uses the MBM counters to measure the bandwidth and MBA throttle
- * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
- * fact that resctrl rdtgroups have both monitoring and control.
- *
- * The frequency of the checks is 1s and we just tag along the MBM overflow
- * timer. Having 1s interval makes the calculation of bandwidth simpler.
- *
- * Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid unnecessarily restricting
- * the L2 <-> L3 traffic.
- *
- * Since MBA controls the L2 external bandwidth where as MBM measures the
- * L3 external bandwidth the following sequence could lead to such a
- * situation.
- *
- * Consider an rdtgroup which had high L3 <-> memory traffic in initial
- * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
- * after some time rdtgroup has mostly L2 <-> L3 traffic.
- *
- * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
- * throttle MSRs already have low percentage values.  To avoid
- * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
- */
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
-{
-       u32 closid, rmid, cur_msr_val, new_msr_val;
-       struct mbm_state *pmbm_data, *cmbm_data;
-       struct rdt_ctrl_domain *dom_mba;
-       enum resctrl_event_id evt_id;
-       struct rdt_resource *r_mba;
-       struct list_head *head;
-       struct rdtgroup *entry;
-       u32 cur_bw, user_bw;
-
-       r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-       evt_id = rgrp->mba_mbps_event;
-
-       closid = rgrp->closid;
-       rmid = rgrp->mon.rmid;
-       pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
-       if (WARN_ON_ONCE(!pmbm_data))
-               return;
-
-       dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
-       if (!dom_mba) {
-               pr_warn_once("Failure to get domain for MBA update\n");
-               return;
-       }
-
-       cur_bw = pmbm_data->prev_bw;
-       user_bw = dom_mba->mbps_val[closid];
-
-       /* MBA resource doesn't support CDP */
-       cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
-
-       /*
-        * For Ctrl groups read data from child monitor groups.
-        */
-       head = &rgrp->mon.crdtgrp_list;
-       list_for_each_entry(entry, head, mon.crdtgrp_list) {
-               cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
-               if (WARN_ON_ONCE(!cmbm_data))
-                       return;
-               cur_bw += cmbm_data->prev_bw;
-       }
-
-       /*
-        * Scale up/down the bandwidth linearly for the ctrl group.  The
-        * bandwidth step is the bandwidth granularity specified by the
-        * hardware.
-        * Always increase throttling if current bandwidth is above the
-        * target set by user.
-        * But avoid thrashing up and down on every poll by checking
-        * whether a decrease in throttling is likely to push the group
-        * back over target. E.g. if currently throttling to 30% of bandwidth
-        * on a system with 10% granularity steps, check whether moving to
-        * 40% would go past the limit by multiplying current bandwidth by
-        * "(30 + 10) / 30".
-        */
-       if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
-               new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
-       } else if (cur_msr_val < MAX_MBA_BW &&
-                  (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
-               new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
-       } else {
-               return;
-       }
-
-       resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-}
-
-static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
-                                u32 closid, u32 rmid, enum resctrl_event_id evtid)
-{
-       struct rmid_read rr = {0};
-
-       rr.r = r;
-       rr.d = d;
-       rr.evtid = evtid;
-       rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
-       if (IS_ERR(rr.arch_mon_ctx)) {
-               pr_warn_ratelimited("Failed to allocate monitor context: %ld",
-                                   PTR_ERR(rr.arch_mon_ctx));
-               return;
-       }
-
-       __mon_event_count(closid, rmid, &rr);
-
-       /*
-        * If the software controller is enabled, compute the
-        * bandwidth for this event id.
-        */
-       if (is_mba_sc(NULL))
-               mbm_bw_count(closid, rmid, &rr);
-
-       resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-}
-
-static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
-                      u32 closid, u32 rmid)
-{
-       /*
-        * This is protected from concurrent reads from user as both
-        * the user and overflow handler hold the global mutex.
-        */
-       if (resctrl_arch_is_mbm_total_enabled())
-               mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
-
-       if (resctrl_arch_is_mbm_local_enabled())
-               mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-/*
- * Handler to scan the limbo list and move the RMIDs
- * to free list whose occupancy < threshold_occupancy.
- */
-void cqm_handle_limbo(struct work_struct *work)
-{
-       unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
-       struct rdt_mon_domain *d;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
-
-       __check_limbo(d, false);
-
-       if (has_busy_rmid(d)) {
-               d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
-                                                          RESCTRL_PICK_ANY_CPU);
-               schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
-                                        delay);
-       }
-
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-}
-
-/**
- * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
- *                             domain.
- * @dom:           The domain the limbo handler should run for.
- * @delay_ms:      How far in the future the handler should run.
- * @exclude_cpu:   Which CPU the handler should not run on,
- *                RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
-                            int exclude_cpu)
-{
-       unsigned long delay = msecs_to_jiffies(delay_ms);
-       int cpu;
-
-       cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
-       dom->cqm_work_cpu = cpu;
-
-       if (cpu < nr_cpu_ids)
-               schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
-}
-
-void mbm_handle_overflow(struct work_struct *work)
-{
-       unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
-       struct rdtgroup *prgrp, *crgrp;
-       struct rdt_mon_domain *d;
-       struct list_head *head;
-       struct rdt_resource *r;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       /*
-        * If the filesystem has been unmounted this work no longer needs to
-        * run.
-        */
-       if (!resctrl_mounted || !resctrl_arch_mon_capable())
-               goto out_unlock;
-
-       r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       d = container_of(work, struct rdt_mon_domain, mbm_over.work);
-
-       list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
-               mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
-
-               head = &prgrp->mon.crdtgrp_list;
-               list_for_each_entry(crgrp, head, mon.crdtgrp_list)
-                       mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
-
-               if (is_mba_sc(NULL))
-                       update_mba_bw(prgrp, d);
-       }
-
-       /*
-        * Re-check for housekeeping CPUs. This allows the overflow handler to
-        * move off a nohz_full CPU quickly.
-        */
-       d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
-                                                  RESCTRL_PICK_ANY_CPU);
-       schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
-
-out_unlock:
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-}
-
-/**
- * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
- *                                domain.
- * @dom:           The domain the overflow handler should run for.
- * @delay_ms:      How far in the future the handler should run.
- * @exclude_cpu:   Which CPU the handler should not run on,
- *                RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
-                               int exclude_cpu)
-{
-       unsigned long delay = msecs_to_jiffies(delay_ms);
-       int cpu;
-
-       /*
-        * When a domain comes online there is no guarantee the filesystem is
-        * mounted. If not, there is no need to catch counter overflow.
-        */
-       if (!resctrl_mounted || !resctrl_arch_mon_capable())
-               return;
-       cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
-       dom->mbm_work_cpu = cpu;
-
-       if (cpu < nr_cpu_ids)
-               schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
-}
-
-static int dom_data_init(struct rdt_resource *r)
-{
-       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-       u32 num_closid = resctrl_arch_get_num_closid(r);
-       struct rmid_entry *entry = NULL;
-       int err = 0, i;
-       u32 idx;
-
-       mutex_lock(&rdtgroup_mutex);
-       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
-               u32 *tmp;
-
-               /*
-                * If the architecture hasn't provided a sanitised value here,
-                * this may result in larger arrays than necessary. Resctrl will
-                * use a smaller system wide value based on the resources in
-                * use.
-                */
-               tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
-               if (!tmp) {
-                       err = -ENOMEM;
-                       goto out_unlock;
-               }
-
-               closid_num_dirty_rmid = tmp;
-       }
-
-       rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
-       if (!rmid_ptrs) {
-               if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
-                       kfree(closid_num_dirty_rmid);
-                       closid_num_dirty_rmid = NULL;
-               }
-               err = -ENOMEM;
-               goto out_unlock;
-       }
-
-       for (i = 0; i < idx_limit; i++) {
-               entry = &rmid_ptrs[i];
-               INIT_LIST_HEAD(&entry->list);
-
-               resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
-               list_add_tail(&entry->list, &rmid_free_lru);
-       }
-
-       /*
-        * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
-        * are always allocated. These are used for the rdtgroup_default
-        * control group, which will be setup later in resctrl_init().
-        */
-       idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
-                                          RESCTRL_RESERVED_RMID);
-       entry = __rmid_entry(idx);
-       list_del(&entry->list);
-
-out_unlock:
-       mutex_unlock(&rdtgroup_mutex);
-
-       return err;
-}
-
-static void dom_data_exit(struct rdt_resource *r)
-{
-       mutex_lock(&rdtgroup_mutex);
-
-       if (!r->mon_capable)
-               goto out_unlock;
-
-       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
-               kfree(closid_num_dirty_rmid);
-               closid_num_dirty_rmid = NULL;
-       }
-
-       kfree(rmid_ptrs);
-       rmid_ptrs = NULL;
-
-out_unlock:
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-static struct mon_evt llc_occupancy_event = {
-       .name           = "llc_occupancy",
-       .evtid          = QOS_L3_OCCUP_EVENT_ID,
-};
-
-static struct mon_evt mbm_total_event = {
-       .name           = "mbm_total_bytes",
-       .evtid          = QOS_L3_MBM_TOTAL_EVENT_ID,
-};
-
-static struct mon_evt mbm_local_event = {
-       .name           = "mbm_local_bytes",
-       .evtid          = QOS_L3_MBM_LOCAL_EVENT_ID,
-};
-
-/*
- * Initialize the event list for the resource.
- *
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
- */
-static void l3_mon_evt_init(struct rdt_resource *r)
-{
-       INIT_LIST_HEAD(&r->evt_list);
-
-       if (resctrl_arch_is_llc_occupancy_enabled())
-               list_add_tail(&llc_occupancy_event.list, &r->evt_list);
-       if (resctrl_arch_is_mbm_total_enabled())
-               list_add_tail(&mbm_total_event.list, &r->evt_list);
-       if (resctrl_arch_is_mbm_local_enabled())
-               list_add_tail(&mbm_local_event.list, &r->evt_list);
-}
-
  /*
   * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
   * which indicates that RMIDs are configured in legacy mode.
@@ -1193,51 +340,6 @@ static __init int snc_get_config(void)
         return ret;
  }
  
-/**
- * resctrl_mon_resource_init() - Initialise global monitoring structures.
- *
- * Allocate and initialise global monitor resources that do not belong to a
- * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
- * Called once during boot after the struct rdt_resource's have been configured
- * but before the filesystem is mounted.
- * Resctrl's cpuhp callbacks may be called before this point to bring a domain
- * online.
- *
- * Returns 0 for success, or -ENOMEM.
- */
-int resctrl_mon_resource_init(void)
-{
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       int ret;
-
-       if (!r->mon_capable)
-               return 0;
-
-       ret = dom_data_init(r);
-       if (ret)
-               return ret;
-
-       l3_mon_evt_init(r);
-
-       if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
-               mbm_total_event.configurable = true;
-               resctrl_file_fflags_init("mbm_total_bytes_config",
-                                        RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
-       }
-       if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
-               mbm_local_event.configurable = true;
-               resctrl_file_fflags_init("mbm_local_bytes_config",
-                                        RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
-       }
-
-       if (resctrl_arch_is_mbm_local_enabled())
-               mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
-       else if (resctrl_arch_is_mbm_total_enabled())
-               mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-
-       return 0;
-}
-
  int __init rdt_get_mon_l3_config(struct rdt_resource *r)
  {
         unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
@@ -1285,13 +387,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
         return 0;
  }
  
-void resctrl_mon_resource_exit(void)
-{
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-
-       dom_data_exit(r);
-}
-
  void __init intel_rdt_mbm_apply_quirk(void)
  {
         int cf_index;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor_trace.h b/arch/x86/kernel/cpu/resctrl/monitor_trace.h

deleted file mode 100644 (file)

index ade67da..0000000
--- a/arch/x86/kernel/cpu/resctrl/monitor_trace.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM resctrl
-
-#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _FS_RESCTRL_MONITOR_TRACE_H
-
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(mon_llc_occupancy_limbo,
-           TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
-           TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
-           TP_STRUCT__entry(__field(u32, ctrl_hw_id)
-                            __field(u32, mon_hw_id)
-                            __field(int, domain_id)
-                            __field(u64, llc_occupancy_bytes)),
-           TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
-                          __entry->mon_hw_id = mon_hw_id;
-                          __entry->domain_id = domain_id;
-                          __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
-           TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
-                     __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
-                     __entry->llc_occupancy_bytes)
-          );
-
-#endif /* _FS_RESCTRL_MONITOR_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE monitor_trace
-#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c

index bb39ffdd852495d00d7644ab1d70f4972814c47f..241d0d7e1cb5fc5537ad075ff4b1dc0d34cc4621 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -12,17 +12,10 @@
  #define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
  
  #include <linux/cacheflush.h>
-#include <linux/cacheinfo.h>
  #include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/debugfs.h>
-#include <linux/kthread.h>
-#include <linux/mman.h>
  #include <linux/perf_event.h>
  #include <linux/pm_qos.h>
  #include <linux/resctrl.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
  
  #include <asm/cpu_device_id.h>
  #include <asm/perf_event.h>
@@ -31,6 +24,7 @@
  #include "internal.h"
  
  #define CREATE_TRACE_POINTS
+
  #include "pseudo_lock_trace.h"
  
  /*
@@ -39,29 +33,6 @@
   */
  static u64 prefetch_disable_bits;
  
-/*
- * Major number assigned to and shared by all devices exposing
- * pseudo-locked regions.
- */
-static unsigned int pseudo_lock_major;
-static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
-
-static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
-{
-       const struct rdtgroup *rdtgrp;
-
-       rdtgrp = dev_get_drvdata(dev);
-       if (mode)
-               *mode = 0600;
-       guard(mutex)(&rdtgroup_mutex);
-       return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
-}
-
-static const struct class pseudo_lock_class = {
-       .name = "pseudo_lock",
-       .devnode = pseudo_lock_devnode,
-};
-
  /**
   * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
   *                                          platforms
@@ -122,298 +93,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void)
         return prefetch_disable_bits;
  }
  
-/**
- * pseudo_lock_minor_get - Obtain available minor number
- * @minor: Pointer to where new minor number will be stored
- *
- * A bitmask is used to track available minor numbers. Here the next free
- * minor number is marked as unavailable and returned.
- *
- * Return: 0 on success, <0 on failure.
- */
-static int pseudo_lock_minor_get(unsigned int *minor)
-{
-       unsigned long first_bit;
-
-       first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
-
-       if (first_bit == MINORBITS)
-               return -ENOSPC;
-
-       __clear_bit(first_bit, &pseudo_lock_minor_avail);
-       *minor = first_bit;
-
-       return 0;
-}
-
-/**
- * pseudo_lock_minor_release - Return minor number to available
- * @minor: The minor number made available
- */
-static void pseudo_lock_minor_release(unsigned int minor)
-{
-       __set_bit(minor, &pseudo_lock_minor_avail);
-}
-
-/**
- * region_find_by_minor - Locate a pseudo-lock region by inode minor number
- * @minor: The minor number of the device representing pseudo-locked region
- *
- * When the character device is accessed we need to determine which
- * pseudo-locked region it belongs to. This is done by matching the minor
- * number of the device to the pseudo-locked region it belongs.
- *
- * Minor numbers are assigned at the time a pseudo-locked region is associated
- * with a cache instance.
- *
- * Return: On success return pointer to resource group owning the pseudo-locked
- *         region, NULL on failure.
- */
-static struct rdtgroup *region_find_by_minor(unsigned int minor)
-{
-       struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
-
-       list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
-               if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
-                       rdtgrp_match = rdtgrp;
-                       break;
-               }
-       }
-       return rdtgrp_match;
-}
-
-/**
- * struct pseudo_lock_pm_req - A power management QoS request list entry
- * @list:      Entry within the @pm_reqs list for a pseudo-locked region
- * @req:       PM QoS request
- */
-struct pseudo_lock_pm_req {
-       struct list_head list;
-       struct dev_pm_qos_request req;
-};
-
-static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
-{
-       struct pseudo_lock_pm_req *pm_req, *next;
-
-       list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
-               dev_pm_qos_remove_request(&pm_req->req);
-               list_del(&pm_req->list);
-               kfree(pm_req);
-       }
-}
-
-/**
- * pseudo_lock_cstates_constrain - Restrict cores from entering C6
- * @plr: Pseudo-locked region
- *
- * To prevent the cache from being affected by power management entering
- * C6 has to be avoided. This is accomplished by requesting a latency
- * requirement lower than lowest C6 exit latency of all supported
- * platforms as found in the cpuidle state tables in the intel_idle driver.
- * At this time it is possible to do so with a single latency requirement
- * for all supported platforms.
- *
- * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
- * the ACPI latencies need to be considered while keeping in mind that C2
- * may be set to map to deeper sleep states. In this case the latency
- * requirement needs to prevent entering C2 also.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
-{
-       struct pseudo_lock_pm_req *pm_req;
-       int cpu;
-       int ret;
-
-       for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
-               pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
-               if (!pm_req) {
-                       rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
-                       ret = -ENOMEM;
-                       goto out_err;
-               }
-               ret = dev_pm_qos_add_request(get_cpu_device(cpu),
-                                            &pm_req->req,
-                                            DEV_PM_QOS_RESUME_LATENCY,
-                                            30);
-               if (ret < 0) {
-                       rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
-                                           cpu);
-                       kfree(pm_req);
-                       ret = -1;
-                       goto out_err;
-               }
-               list_add(&pm_req->list, &plr->pm_reqs);
-       }
-
-       return 0;
-
-out_err:
-       pseudo_lock_cstates_relax(plr);
-       return ret;
-}
-
-/**
- * pseudo_lock_region_clear - Reset pseudo-lock region data
- * @plr: pseudo-lock region
- *
- * All content of the pseudo-locked region is reset - any memory allocated
- * freed.
- *
- * Return: void
- */
-static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
-{
-       plr->size = 0;
-       plr->line_size = 0;
-       kfree(plr->kmem);
-       plr->kmem = NULL;
-       plr->s = NULL;
-       if (plr->d)
-               plr->d->plr = NULL;
-       plr->d = NULL;
-       plr->cbm = 0;
-       plr->debugfs_dir = NULL;
-}
-
-/**
- * pseudo_lock_region_init - Initialize pseudo-lock region information
- * @plr: pseudo-lock region
- *
- * Called after user provided a schemata to be pseudo-locked. From the
- * schemata the &struct pseudo_lock_region is on entry already initialized
- * with the resource, domain, and capacity bitmask. Here the information
- * required for pseudo-locking is deduced from this data and &struct
- * pseudo_lock_region initialized further. This information includes:
- * - size in bytes of the region to be pseudo-locked
- * - cache line size to know the stride with which data needs to be accessed
- *   to be pseudo-locked
- * - a cpu associated with the cache instance on which the pseudo-locking
- *   flow can be executed
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
-{
-       enum resctrl_scope scope = plr->s->res->ctrl_scope;
-       struct cacheinfo *ci;
-       int ret;
-
-       if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
-               return -ENODEV;
-
-       /* Pick the first cpu we find that is associated with the cache. */
-       plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
-
-       if (!cpu_online(plr->cpu)) {
-               rdt_last_cmd_printf("CPU %u associated with cache not online\n",
-                                   plr->cpu);
-               ret = -ENODEV;
-               goto out_region;
-       }
-
-       ci = get_cpu_cacheinfo_level(plr->cpu, scope);
-       if (ci) {
-               plr->line_size = ci->coherency_line_size;
-               plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-               return 0;
-       }
-
-       ret = -1;
-       rdt_last_cmd_puts("Unable to determine cache line size\n");
-out_region:
-       pseudo_lock_region_clear(plr);
-       return ret;
-}
-
-/**
- * pseudo_lock_init - Initialize a pseudo-lock region
- * @rdtgrp: resource group to which new pseudo-locked region will belong
- *
- * A pseudo-locked region is associated with a resource group. When this
- * association is created the pseudo-locked region is initialized. The
- * details of the pseudo-locked region are not known at this time so only
- * allocation is done and association established.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_init(struct rdtgroup *rdtgrp)
-{
-       struct pseudo_lock_region *plr;
-
-       plr = kzalloc(sizeof(*plr), GFP_KERNEL);
-       if (!plr)
-               return -ENOMEM;
-
-       init_waitqueue_head(&plr->lock_thread_wq);
-       INIT_LIST_HEAD(&plr->pm_reqs);
-       rdtgrp->plr = plr;
-       return 0;
-}
-
-/**
- * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
- * @plr: pseudo-lock region
- *
- * Initialize the details required to set up the pseudo-locked region and
- * allocate the contiguous memory that will be pseudo-locked to the cache.
- *
- * Return: 0 on success, <0 on failure.  Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
-{
-       int ret;
-
-       ret = pseudo_lock_region_init(plr);
-       if (ret < 0)
-               return ret;
-
-       /*
-        * We do not yet support contiguous regions larger than
-        * KMALLOC_MAX_SIZE.
-        */
-       if (plr->size > KMALLOC_MAX_SIZE) {
-               rdt_last_cmd_puts("Requested region exceeds maximum size\n");
-               ret = -E2BIG;
-               goto out_region;
-       }
-
-       plr->kmem = kzalloc(plr->size, GFP_KERNEL);
-       if (!plr->kmem) {
-               rdt_last_cmd_puts("Unable to allocate memory\n");
-               ret = -ENOMEM;
-               goto out_region;
-       }
-
-       ret = 0;
-       goto out;
-out_region:
-       pseudo_lock_region_clear(plr);
-out:
-       return ret;
-}
-
-/**
- * pseudo_lock_free - Free a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-locked region belonged
- *
- * The pseudo-locked region's resources have already been released, or not
- * yet created at this point. Now it can be freed and disassociated from the
- * resource group.
- *
- * Return: void
- */
-static void pseudo_lock_free(struct rdtgroup *rdtgrp)
-{
-       pseudo_lock_region_clear(rdtgrp->plr);
-       kfree(rdtgrp->plr);
-       rdtgrp->plr = NULL;
-}
-
  /**
   * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
   * @_plr: the pseudo-lock region descriptor
@@ -543,340 +222,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr)
         return 0;
  }
  
-/**
- * rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @rdtgrp: resource group being queried
- *
- * Return: 1 if monitor groups have been created for this resource
- * group, 0 otherwise.
- */
-static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
-{
-       return !list_empty(&rdtgrp->mon.crdtgrp_list);
-}
-
-/**
- * rdtgroup_locksetup_user_restrict - Restrict user access to group
- * @rdtgrp: resource group needing access restricted
- *
- * A resource group used for cache pseudo-locking cannot have cpus or tasks
- * assigned to it. This is communicated to the user by restricting access
- * to all the files that can be used to make such changes.
- *
- * Permissions restored with rdtgroup_locksetup_user_restore()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restriction of access an attempt will be made to restore permissions but
- * the state of the mode of these files will be uncertain when a failure
- * occurs.
- */
-static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
-{
-       int ret;
-
-       ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
-       if (ret)
-               return ret;
-
-       ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
-       if (ret)
-               goto err_tasks;
-
-       ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
-       if (ret)
-               goto err_cpus;
-
-       if (resctrl_arch_mon_capable()) {
-               ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
-               if (ret)
-                       goto err_cpus_list;
-       }
-
-       ret = 0;
-       goto out;
-
-err_cpus_list:
-       rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
-err_cpus:
-       rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
-err_tasks:
-       rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
-out:
-       return ret;
-}
-
-/**
- * rdtgroup_locksetup_user_restore - Restore user access to group
- * @rdtgrp: resource group needing access restored
- *
- * Restore all file access previously removed using
- * rdtgroup_locksetup_user_restrict()
- *
- * Return: 0 on success, <0 on failure.  If a failure occurs during the
- * restoration of access an attempt will be made to restrict permissions
- * again but the state of the mode of these files will be uncertain when
- * a failure occurs.
- */
-static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
-{
-       int ret;
-
-       ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
-       if (ret)
-               return ret;
-
-       ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
-       if (ret)
-               goto err_tasks;
-
-       ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
-       if (ret)
-               goto err_cpus;
-
-       if (resctrl_arch_mon_capable()) {
-               ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
-               if (ret)
-                       goto err_cpus_list;
-       }
-
-       ret = 0;
-       goto out;
-
-err_cpus_list:
-       rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
-err_cpus:
-       rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
-err_tasks:
-       rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
-out:
-       return ret;
-}
-
-/**
- * rdtgroup_locksetup_enter - Resource group enters locksetup mode
- * @rdtgrp: resource group requested to enter locksetup mode
- *
- * A resource group enters locksetup mode to reflect that it would be used
- * to represent a pseudo-locked region and is in the process of being set
- * up to do so. A resource group used for a pseudo-locked region would
- * lose the closid associated with it so we cannot allow it to have any
- * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
- * future. Monitoring of a pseudo-locked region is not allowed either.
- *
- * The above and more restrictions on a pseudo-locked region are checked
- * for and enforced before the resource group enters the locksetup mode.
- *
- * Returns: 0 if the resource group successfully entered locksetup mode, <0
- * on failure. On failure the last_cmd_status buffer is updated with text to
- * communicate details of failure to the user.
- */
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
-       int ret;
-
-       /*
-        * The default resource group can neither be removed nor lose the
-        * default closid associated with it.
-        */
-       if (rdtgrp == &rdtgroup_default) {
-               rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
-               return -EINVAL;
-       }
-
-       /*
-        * Cache Pseudo-locking not supported when CDP is enabled.
-        *
-        * Some things to consider if you would like to enable this
-        * support (using L3 CDP as example):
-        * - When CDP is enabled two separate resources are exposed,
-        *   L3DATA and L3CODE, but they are actually on the same cache.
-        *   The implication for pseudo-locking is that if a
-        *   pseudo-locked region is created on a domain of one
-        *   resource (eg. L3CODE), then a pseudo-locked region cannot
-        *   be created on that same domain of the other resource
-        *   (eg. L3DATA). This is because the creation of a
-        *   pseudo-locked region involves a call to wbinvd that will
-        *   affect all cache allocations on particular domain.
-        * - Considering the previous, it may be possible to only
-        *   expose one of the CDP resources to pseudo-locking and
-        *   hide the other. For example, we could consider to only
-        *   expose L3DATA and since the L3 cache is unified it is
-        *   still possible to place instructions there are execute it.
-        * - If only one region is exposed to pseudo-locking we should
-        *   still keep in mind that availability of a portion of cache
-        *   for pseudo-locking should take into account both resources.
-        *   Similarly, if a pseudo-locked region is created in one
-        *   resource, the portion of cache used by it should be made
-        *   unavailable to all future allocations from both resources.
-        */
-       if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
-           resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
-               rdt_last_cmd_puts("CDP enabled\n");
-               return -EINVAL;
-       }
-
-       /*
-        * Not knowing the bits to disable prefetching implies that this
-        * platform does not support Cache Pseudo-Locking.
-        */
-       if (resctrl_arch_get_prefetch_disable_bits() == 0) {
-               rdt_last_cmd_puts("Pseudo-locking not supported\n");
-               return -EINVAL;
-       }
-
-       if (rdtgroup_monitor_in_progress(rdtgrp)) {
-               rdt_last_cmd_puts("Monitoring in progress\n");
-               return -EINVAL;
-       }
-
-       if (rdtgroup_tasks_assigned(rdtgrp)) {
-               rdt_last_cmd_puts("Tasks assigned to resource group\n");
-               return -EINVAL;
-       }
-
-       if (!cpumask_empty(&rdtgrp->cpu_mask)) {
-               rdt_last_cmd_puts("CPUs assigned to resource group\n");
-               return -EINVAL;
-       }
-
-       if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
-               rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
-               return -EIO;
-       }
-
-       ret = pseudo_lock_init(rdtgrp);
-       if (ret) {
-               rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
-               goto out_release;
-       }
-
-       /*
-        * If this system is capable of monitoring a rmid would have been
-        * allocated when the control group was created. This is not needed
-        * anymore when this group would be used for pseudo-locking. This
-        * is safe to call on platforms not capable of monitoring.
-        */
-       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
-       ret = 0;
-       goto out;
-
-out_release:
-       rdtgroup_locksetup_user_restore(rdtgrp);
-out:
-       return ret;
-}
-
-/**
- * rdtgroup_locksetup_exit - resource group exits locksetup mode
- * @rdtgrp: resource group
- *
- * When a resource group exits locksetup mode the earlier restrictions are
- * lifted.
- *
- * Return: 0 on success, <0 on failure
- */
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
-       int ret;
-
-       if (resctrl_arch_mon_capable()) {
-               ret = alloc_rmid(rdtgrp->closid);
-               if (ret < 0) {
-                       rdt_last_cmd_puts("Out of RMIDs\n");
-                       return ret;
-               }
-               rdtgrp->mon.rmid = ret;
-       }
-
-       ret = rdtgroup_locksetup_user_restore(rdtgrp);
-       if (ret) {
-               free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-               return ret;
-       }
-
-       pseudo_lock_free(rdtgrp);
-       return 0;
-}
-
-/**
- * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
- * @d: RDT domain
- * @cbm: CBM to test
- *
- * @d represents a cache instance and @cbm a capacity bitmask that is
- * considered for it. Determine if @cbm overlaps with any existing
- * pseudo-locked region on @d.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: true if @cbm overlaps with pseudo-locked region on @d, false
- * otherwise.
- */
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
-{
-       unsigned int cbm_len;
-       unsigned long cbm_b;
-
-       if (d->plr) {
-               cbm_len = d->plr->s->res->cache.cbm_len;
-               cbm_b = d->plr->cbm;
-               if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
-                       return true;
-       }
-       return false;
-}
-
-/**
- * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
- * @d: RDT domain under test
- *
- * The setup of a pseudo-locked region affects all cache instances within
- * the hierarchy of the region. It is thus essential to know if any
- * pseudo-locked regions exist within a cache hierarchy to prevent any
- * attempts to create new pseudo-locked regions in the same hierarchy.
- *
- * Return: true if a pseudo-locked region exists in the hierarchy of @d or
- *         if it is not possible to test due to memory allocation issue,
- *         false otherwise.
- */
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
-{
-       struct rdt_ctrl_domain *d_i;
-       cpumask_var_t cpu_with_psl;
-       struct rdt_resource *r;
-       bool ret = false;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
-               return true;
-
-       /*
-        * First determine which cpus have pseudo-locked regions
-        * associated with them.
-        */
-       for_each_alloc_capable_rdt_resource(r) {
-               list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
-                       if (d_i->plr)
-                               cpumask_or(cpu_with_psl, cpu_with_psl,
-                                          &d_i->hdr.cpu_mask);
-               }
-       }
-
-       /*
-        * Next test if new pseudo-locked region would intersect with
-        * existing region.
-        */
-       if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
-               ret = true;
-
-       free_cpumask_var(cpu_with_psl);
-       return ret;
-}
-
  /**
   * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
   *                                      pseudo-locked memory
@@ -1169,433 +514,3 @@ out:
         wake_up_interruptible(&plr->lock_thread_wq);
         return 0;
  }
-
-/**
- * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
- * @rdtgrp: Resource group to which the pseudo-locked region belongs.
- * @sel: Selector of which measurement to perform on a pseudo-locked region.
- *
- * The measurement of latency to access a pseudo-locked region should be
- * done from a cpu that is associated with that pseudo-locked region.
- * Determine which cpu is associated with this region and start a thread on
- * that cpu to perform the measurement, wait for that thread to complete.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
-{
-       struct pseudo_lock_region *plr = rdtgrp->plr;
-       struct task_struct *thread;
-       unsigned int cpu;
-       int ret = -1;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       if (rdtgrp->flags & RDT_DELETED) {
-               ret = -ENODEV;
-               goto out;
-       }
-
-       if (!plr->d) {
-               ret = -ENODEV;
-               goto out;
-       }
-
-       plr->thread_done = 0;
-       cpu = cpumask_first(&plr->d->hdr.cpu_mask);
-       if (!cpu_online(cpu)) {
-               ret = -ENODEV;
-               goto out;
-       }
-
-       plr->cpu = cpu;
-
-       if (sel == 1)
-               thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
-                                           plr, cpu, "pseudo_lock_measure/%u");
-       else if (sel == 2)
-               thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
-                                           plr, cpu, "pseudo_lock_measure/%u");
-       else if (sel == 3)
-               thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
-                                           plr, cpu, "pseudo_lock_measure/%u");
-       else
-               goto out;
-
-       if (IS_ERR(thread)) {
-               ret = PTR_ERR(thread);
-               goto out;
-       }
-
-       ret = wait_event_interruptible(plr->lock_thread_wq,
-                                      plr->thread_done == 1);
-       if (ret < 0)
-               goto out;
-
-       ret = 0;
-
-out:
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-       return ret;
-}
-
-static ssize_t pseudo_lock_measure_trigger(struct file *file,
-                                          const char __user *user_buf,
-                                          size_t count, loff_t *ppos)
-{
-       struct rdtgroup *rdtgrp = file->private_data;
-       size_t buf_size;
-       char buf[32];
-       int ret;
-       int sel;
-
-       buf_size = min(count, (sizeof(buf) - 1));
-       if (copy_from_user(buf, user_buf, buf_size))
-               return -EFAULT;
-
-       buf[buf_size] = '\0';
-       ret = kstrtoint(buf, 10, &sel);
-       if (ret == 0) {
-               if (sel != 1 && sel != 2 && sel != 3)
-                       return -EINVAL;
-               ret = debugfs_file_get(file->f_path.dentry);
-               if (ret)
-                       return ret;
-               ret = pseudo_lock_measure_cycles(rdtgrp, sel);
-               if (ret == 0)
-                       ret = count;
-               debugfs_file_put(file->f_path.dentry);
-       }
-
-       return ret;
-}
-
-static const struct file_operations pseudo_measure_fops = {
-       .write = pseudo_lock_measure_trigger,
-       .open = simple_open,
-       .llseek = default_llseek,
-};
-
-/**
- * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-lock region belongs
- *
- * Called when a resource group in the pseudo-locksetup mode receives a
- * valid schemata that should be pseudo-locked. Since the resource group is
- * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
- * allocated and initialized with the essential information. If a failure
- * occurs the resource group remains in the pseudo-locksetup mode with the
- * &struct pseudo_lock_region associated with it, but cleared from all
- * information and ready for the user to re-attempt pseudo-locking by
- * writing the schemata again.
- *
- * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
- * on failure. Descriptive error will be written to last_cmd_status buffer.
- */
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
-       struct pseudo_lock_region *plr = rdtgrp->plr;
-       struct task_struct *thread;
-       unsigned int new_minor;
-       struct device *dev;
-       char *kn_name __free(kfree) = NULL;
-       int ret;
-
-       ret = pseudo_lock_region_alloc(plr);
-       if (ret < 0)
-               return ret;
-
-       ret = pseudo_lock_cstates_constrain(plr);
-       if (ret < 0) {
-               ret = -EINVAL;
-               goto out_region;
-       }
-       kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
-       if (!kn_name) {
-               ret = -ENOMEM;
-               goto out_cstates;
-       }
-
-       plr->thread_done = 0;
-
-       thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
-                                   plr->cpu, "pseudo_lock/%u");
-       if (IS_ERR(thread)) {
-               ret = PTR_ERR(thread);
-               rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
-               goto out_cstates;
-       }
-
-       ret = wait_event_interruptible(plr->lock_thread_wq,
-                                      plr->thread_done == 1);
-       if (ret < 0) {
-               /*
-                * If the thread does not get on the CPU for whatever
-                * reason and the process which sets up the region is
-                * interrupted then this will leave the thread in runnable
-                * state and once it gets on the CPU it will dereference
-                * the cleared, but not freed, plr struct resulting in an
-                * empty pseudo-locking loop.
-                */
-               rdt_last_cmd_puts("Locking thread interrupted\n");
-               goto out_cstates;
-       }
-
-       ret = pseudo_lock_minor_get(&new_minor);
-       if (ret < 0) {
-               rdt_last_cmd_puts("Unable to obtain a new minor number\n");
-               goto out_cstates;
-       }
-
-       /*
-        * Unlock access but do not release the reference. The
-        * pseudo-locked region will still be here on return.
-        *
-        * The mutex has to be released temporarily to avoid a potential
-        * deadlock with the mm->mmap_lock which is obtained in the
-        * device_create() and debugfs_create_dir() callpath below as well as
-        * before the mmap() callback is called.
-        */
-       mutex_unlock(&rdtgroup_mutex);
-
-       if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
-               plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
-               if (!IS_ERR_OR_NULL(plr->debugfs_dir))
-                       debugfs_create_file("pseudo_lock_measure", 0200,
-                                           plr->debugfs_dir, rdtgrp,
-                                           &pseudo_measure_fops);
-       }
-
-       dev = device_create(&pseudo_lock_class, NULL,
-                           MKDEV(pseudo_lock_major, new_minor),
-                           rdtgrp, "%s", kn_name);
-
-       mutex_lock(&rdtgroup_mutex);
-
-       if (IS_ERR(dev)) {
-               ret = PTR_ERR(dev);
-               rdt_last_cmd_printf("Failed to create character device: %d\n",
-                                   ret);
-               goto out_debugfs;
-       }
-
-       /* We released the mutex - check if group was removed while we did so */
-       if (rdtgrp->flags & RDT_DELETED) {
-               ret = -ENODEV;
-               goto out_device;
-       }
-
-       plr->minor = new_minor;
-
-       rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
-       closid_free(rdtgrp->closid);
-       rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
-       rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
-
-       ret = 0;
-       goto out;
-
-out_device:
-       device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
-out_debugfs:
-       debugfs_remove_recursive(plr->debugfs_dir);
-       pseudo_lock_minor_release(new_minor);
-out_cstates:
-       pseudo_lock_cstates_relax(plr);
-out_region:
-       pseudo_lock_region_clear(plr);
-out:
-       return ret;
-}
-
-/**
- * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
- * @rdtgrp: resource group to which the pseudo-locked region belongs
- *
- * The removal of a pseudo-locked region can be initiated when the resource
- * group is removed via a "rmdir" from userspace or the
- * unmount of the resctrl filesystem. On removal the resource group does
- * not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus asymmetry with the creation where the
- * &struct pseudo_lock_region is removed here while it was not created in
- * rdtgroup_pseudo_lock_create().
- *
- * Return: void
- */
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
-{
-       struct pseudo_lock_region *plr = rdtgrp->plr;
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-               /*
-                * Default group cannot be a pseudo-locked region so we can
-                * free closid here.
-                */
-               closid_free(rdtgrp->closid);
-               goto free;
-       }
-
-       pseudo_lock_cstates_relax(plr);
-       debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
-       device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
-       pseudo_lock_minor_release(plr->minor);
-
-free:
-       pseudo_lock_free(rdtgrp);
-}
-
-static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
-{
-       struct rdtgroup *rdtgrp;
-
-       mutex_lock(&rdtgroup_mutex);
-
-       rdtgrp = region_find_by_minor(iminor(inode));
-       if (!rdtgrp) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENODEV;
-       }
-
-       filp->private_data = rdtgrp;
-       atomic_inc(&rdtgrp->waitcount);
-       /* Perform a non-seekable open - llseek is not supported */
-       filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
-
-       mutex_unlock(&rdtgroup_mutex);
-
-       return 0;
-}
-
-static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
-{
-       struct rdtgroup *rdtgrp;
-
-       mutex_lock(&rdtgroup_mutex);
-       rdtgrp = filp->private_data;
-       WARN_ON(!rdtgrp);
-       if (!rdtgrp) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENODEV;
-       }
-       filp->private_data = NULL;
-       atomic_dec(&rdtgrp->waitcount);
-       mutex_unlock(&rdtgroup_mutex);
-       return 0;
-}
-
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
-{
-       /* Not supported */
-       return -EINVAL;
-}
-
-static const struct vm_operations_struct pseudo_mmap_ops = {
-       .mremap = pseudo_lock_dev_mremap,
-};
-
-static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-       unsigned long vsize = vma->vm_end - vma->vm_start;
-       unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
-       struct pseudo_lock_region *plr;
-       struct rdtgroup *rdtgrp;
-       unsigned long physical;
-       unsigned long psize;
-
-       mutex_lock(&rdtgroup_mutex);
-
-       rdtgrp = filp->private_data;
-       WARN_ON(!rdtgrp);
-       if (!rdtgrp) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENODEV;
-       }
-
-       plr = rdtgrp->plr;
-
-       if (!plr->d) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENODEV;
-       }
-
-       /*
-        * Task is required to run with affinity to the cpus associated
-        * with the pseudo-locked region. If this is not the case the task
-        * may be scheduled elsewhere and invalidate entries in the
-        * pseudo-locked region.
-        */
-       if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -EINVAL;
-       }
-
-       physical = __pa(plr->kmem) >> PAGE_SHIFT;
-       psize = plr->size - off;
-
-       if (off > plr->size) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENOSPC;
-       }
-
-       /*
-        * Ensure changes are carried directly to the memory being mapped,
-        * do not allow copy-on-write mapping.
-        */
-       if (!(vma->vm_flags & VM_SHARED)) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -EINVAL;
-       }
-
-       if (vsize > psize) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -ENOSPC;
-       }
-
-       memset(plr->kmem + off, 0, vsize);
-
-       if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
-                           vsize, vma->vm_page_prot)) {
-               mutex_unlock(&rdtgroup_mutex);
-               return -EAGAIN;
-       }
-       vma->vm_ops = &pseudo_mmap_ops;
-       mutex_unlock(&rdtgroup_mutex);
-       return 0;
-}
-
-static const struct file_operations pseudo_lock_dev_fops = {
-       .owner =        THIS_MODULE,
-       .read =         NULL,
-       .write =        NULL,
-       .open =         pseudo_lock_dev_open,
-       .release =      pseudo_lock_dev_release,
-       .mmap =         pseudo_lock_dev_mmap,
-};
-
-int rdt_pseudo_lock_init(void)
-{
-       int ret;
-
-       ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
-       if (ret < 0)
-               return ret;
-
-       pseudo_lock_major = ret;
-
-       ret = class_register(&pseudo_lock_class);
-       if (ret) {
-               unregister_chrdev(pseudo_lock_major, "pseudo_lock");
-               return ret;
-       }
-
-       return 0;
-}
-
-void rdt_pseudo_lock_release(void)
-{
-       class_unregister(&pseudo_lock_class);
-       unregister_chrdev(pseudo_lock_major, "pseudo_lock");
-       pseudo_lock_major = 0;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h

index 5a0fae61d3eebf217377e85efa62697c6524cd56..7c8aef08010f1431a2cfcfb96a32873cd03fc024 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -39,5 +39,7 @@ TRACE_EVENT(pseudo_lock_l3,
  
  #undef TRACE_INCLUDE_PATH
  #define TRACE_INCLUDE_PATH .
+
  #define TRACE_INCLUDE_FILE pseudo_lock_trace
+
  #include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c

index ace86b6dcedec851a701051753f0c169733a1b75..c7a7f0ae373adf1a7a348f1f8b37dc05c570a10d 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -32,4547 +32,230 @@
  #include "internal.h"
  
  DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
-
-static struct kernfs_root *rdt_root;
-struct rdtgroup rdtgroup_default;
-LIST_HEAD(rdt_all_groups);
-
-/* list of entries for the schemata file */
-LIST_HEAD(resctrl_schema_all);
-
-/*
- * List of struct mon_data containing private data of event files for use by
- * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
- */
-static LIST_HEAD(mon_data_kn_priv_list);
-
-/* The filesystem can only be mounted once. */
-bool resctrl_mounted;
-
-/* Kernel fs node for "info" directory under root */
-static struct kernfs_node *kn_info;
-
-/* Kernel fs node for "mon_groups" directory under root */
-static struct kernfs_node *kn_mongrp;
-
-/* Kernel fs node for "mon_data" directory under root */
-static struct kernfs_node *kn_mondata;
-
-/*
- * Used to store the max resource name width to display the schemata names in
- * a tabular format.
- */
-int max_name_width;
-
-static struct seq_buf last_cmd_status;
-static char last_cmd_status_buf[512];
  
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
-static void rdtgroup_destroy_root(void);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  
-struct dentry *debugfs_resctrl;
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
  
  /*
- * Memory bandwidth monitoring event to use for the default CTRL_MON group
- * and each new CTRL_MON group created by the user.  Only relevant when
- * the filesystem is mounted with the "mba_MBps" option so it does not
- * matter that it remains uninitialized on systems that do not support
- * the "mba_MBps" option.
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
   */
-enum resctrl_event_id mba_mbps_default_event;
-
-static bool resctrl_debug;
-
-void rdt_last_cmd_clear(void)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
-       seq_buf_clear(&last_cmd_status);
-}
-
-void rdt_last_cmd_puts(const char *s)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
-       seq_buf_puts(&last_cmd_status, s);
-}
-
-void rdt_last_cmd_printf(const char *fmt, ...)
-{
-       va_list ap;
-
-       va_start(ap, fmt);
-       lockdep_assert_held(&rdtgroup_mutex);
-       seq_buf_vprintf(&last_cmd_status, fmt, ap);
-       va_end(ap);
-}
-
-void rdt_staged_configs_clear(void)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
  {
-       struct rdt_ctrl_domain *dom;
-       struct rdt_resource *r;
-
-       lockdep_assert_held(&rdtgroup_mutex);
+       struct resctrl_cpu_defaults *r = info;
  
-       for_each_alloc_capable_rdt_resource(r) {
-               list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
-                       memset(dom->staged_config, 0, sizeof(dom->staged_config));
+       if (r) {
+               this_cpu_write(pqr_state.default_closid, r->closid);
+               this_cpu_write(pqr_state.default_rmid, r->rmid);
         }
-}
  
-static bool resctrl_is_mbm_enabled(void)
-{
-       return (resctrl_arch_is_mbm_total_enabled() ||
-               resctrl_arch_is_mbm_local_enabled());
+       /*
+        * We cannot unconditionally write the MSR because the current
+        * executing task might have its own closid selected. Just reuse
+        * the context switch code.
+        */
+       resctrl_arch_sched_in(current);
  }
  
-static bool resctrl_is_mbm_event(int e)
-{
-       return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
-               e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
+#define INVALID_CONFIG_INDEX   UINT_MAX
  
-/*
- * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
- * of free CLOSIDs.
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ *                              configurable event
+ * @evtid: event id.
   *
- * Using a global CLOSID across all resources has some advantages and
- * some drawbacks:
- * + We can simply set current's closid to assign a task to a resource
- *   group.
- * + Context switch code can avoid extra memory references deciding which
- *   CLOSID to load into the PQR_ASSOC MSR
- * - We give up some options in configuring resource groups across multi-socket
- *   systems.
- * - Our choices on how to configure each resource become progressively more
- *   limited as the number of resources grows.
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ *         1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ *         INVALID_CONFIG_INDEX for invalid evtid
   */
-static unsigned long *closid_free_map;
-static int closid_free_map_len;
-
-int closids_supported(void)
-{
-       return closid_free_map_len;
-}
-
-static int closid_init(void)
+static inline unsigned int mon_event_config_index_get(u32 evtid)
  {
-       struct resctrl_schema *s;
-       u32 rdt_min_closid = ~0;
-
-       /* Monitor only platforms still call closid_init() */
-       if (list_empty(&resctrl_schema_all))
+       switch (evtid) {
+       case QOS_L3_MBM_TOTAL_EVENT_ID:
                 return 0;
-
-       /* Compute rdt_min_closid across all resources */
-       list_for_each_entry(s, &resctrl_schema_all, list)
-               rdt_min_closid = min(rdt_min_closid, s->num_closid);
-
-       closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
-       if (!closid_free_map)
-               return -ENOMEM;
-       bitmap_fill(closid_free_map, rdt_min_closid);
-
-       /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
-       __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
-       closid_free_map_len = rdt_min_closid;
-
-       return 0;
-}
-
-static void closid_exit(void)
-{
-       bitmap_free(closid_free_map);
-       closid_free_map = NULL;
+       case QOS_L3_MBM_LOCAL_EVENT_ID:
+               return 1;
+       default:
+               /* Should never reach here */
+               return INVALID_CONFIG_INDEX;
+       }
  }
  
-static int closid_alloc(void)
+void resctrl_arch_mon_event_config_read(void *_config_info)
  {
-       int cleanest_closid;
-       u32 closid;
-
-       lockdep_assert_held(&rdtgroup_mutex);
+       struct resctrl_mon_config_info *config_info = _config_info;
+       unsigned int index;
+       u64 msrval;
  
-       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
-           resctrl_arch_is_llc_occupancy_enabled()) {
-               cleanest_closid = resctrl_find_cleanest_closid();
-               if (cleanest_closid < 0)
-                       return cleanest_closid;
-               closid = cleanest_closid;
-       } else {
-               closid = find_first_bit(closid_free_map, closid_free_map_len);
-               if (closid == closid_free_map_len)
-                       return -ENOSPC;
+       index = mon_event_config_index_get(config_info->evtid);
+       if (index == INVALID_CONFIG_INDEX) {
+               pr_warn_once("Invalid event id %d\n", config_info->evtid);
+               return;
         }
-       __clear_bit(closid, closid_free_map);
-
-       return closid;
-}
-
-void closid_free(int closid)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
+       rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
  
-       __set_bit(closid, closid_free_map);
+       /* Report only the valid event configuration bits */
+       config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
  }
  
-/**
- * closid_allocated - test if provided closid is in use
- * @closid: closid to be tested
- *
- * Return: true if @closid is currently associated with a resource group,
- * false if @closid is free
- */
-bool closid_allocated(unsigned int closid)
+void resctrl_arch_mon_event_config_write(void *_config_info)
  {
-       lockdep_assert_held(&rdtgroup_mutex);
+       struct resctrl_mon_config_info *config_info = _config_info;
+       unsigned int index;
  
-       return !test_bit(closid, closid_free_map);
+       index = mon_event_config_index_get(config_info->evtid);
+       if (index == INVALID_CONFIG_INDEX) {
+               pr_warn_once("Invalid event id %d\n", config_info->evtid);
+               return;
+       }
+       wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
  }
  
-/**
- * rdtgroup_mode_by_closid - Return mode of resource group with closid
- * @closid: closid of the resource group
- *
- * Each resource group is associated with a @closid. Here the mode
- * of a resource group can be queried by searching for it using its closid.
- *
- * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
- */
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+static void l3_qos_cfg_update(void *arg)
  {
-       struct rdtgroup *rdtgrp;
-
-       list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
-               if (rdtgrp->closid == closid)
-                       return rdtgrp->mode;
-       }
+       bool *enable = arg;
  
-       return RDT_NUM_MODES;
+       wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
  }
  
-static const char * const rdt_mode_str[] = {
-       [RDT_MODE_SHAREABLE]            = "shareable",
-       [RDT_MODE_EXCLUSIVE]            = "exclusive",
-       [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
-       [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
-};
-
-/**
- * rdtgroup_mode_str - Return the string representation of mode
- * @mode: the resource group mode as &enum rdtgrp_mode
- *
- * Return: string representation of valid mode, "unknown" otherwise
- */
-static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+static void l2_qos_cfg_update(void *arg)
  {
-       if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
-               return "unknown";
+       bool *enable = arg;
  
-       return rdt_mode_str[mode];
+       wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
  }
  
-/* set uid and gid of rdtgroup dirs and files to that of the creator */
-static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+static int set_cache_qos_cfg(int level, bool enable)
  {
-       struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
-                               .ia_uid = current_fsuid(),
-                               .ia_gid = current_fsgid(), };
-
-       if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
-           gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
-               return 0;
+       void (*update)(void *arg);
+       struct rdt_ctrl_domain *d;
+       struct rdt_resource *r_l;
+       cpumask_var_t cpu_mask;
+       int cpu;
  
-       return kernfs_setattr(kn, &iattr);
-}
+       /* Walking r->domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
  
-static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
-{
-       struct kernfs_node *kn;
-       int ret;
+       if (level == RDT_RESOURCE_L3)
+               update = l3_qos_cfg_update;
+       else if (level == RDT_RESOURCE_L2)
+               update = l2_qos_cfg_update;
+       else
+               return -EINVAL;
  
-       kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
-                                 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
-                                 0, rft->kf_ops, rft, NULL, NULL);
-       if (IS_ERR(kn))
-               return PTR_ERR(kn);
+       if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       ret = rdtgroup_kn_set_ugid(kn);
-       if (ret) {
-               kernfs_remove(kn);
-               return ret;
+       r_l = &rdt_resources_all[level].r_resctrl;
+       list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
+               if (r_l->cache.arch_has_per_cpu_cfg)
+                       /* Pick all the CPUs in the domain instance */
+                       for_each_cpu(cpu, &d->hdr.cpu_mask)
+                               cpumask_set_cpu(cpu, cpu_mask);
+               else
+                       /* Pick one CPU from each domain instance to update MSR */
+                       cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
         }
  
-       return 0;
-}
+       /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+       on_each_cpu_mask(cpu_mask, update, &enable, 1);
  
-static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
-{
-       struct kernfs_open_file *of = m->private;
-       struct rftype *rft = of->kn->priv;
+       free_cpumask_var(cpu_mask);
  
-       if (rft->seq_show)
-               return rft->seq_show(of, m, arg);
         return 0;
  }
  
-static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
-                                  size_t nbytes, loff_t off)
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
  {
-       struct rftype *rft = of->kn->priv;
-
-       if (rft->write)
-               return rft->write(of, buf, nbytes, off);
-
-       return -EINVAL;
-}
-
-static const struct kernfs_ops rdtgroup_kf_single_ops = {
-       .atomic_write_len       = PAGE_SIZE,
-       .write                  = rdtgroup_file_write,
-       .seq_show               = rdtgroup_seqfile_show,
-};
+       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
  
-static const struct kernfs_ops kf_mondata_ops = {
-       .atomic_write_len       = PAGE_SIZE,
-       .seq_show               = rdtgroup_mondata_show,
-};
+       if (!r->cdp_capable)
+               return;
  
-static bool is_cpu_list(struct kernfs_open_file *of)
-{
-       struct rftype *rft = of->kn->priv;
+       if (r->rid == RDT_RESOURCE_L2)
+               l2_qos_cfg_update(&hw_res->cdp_enabled);
  
-       return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+       if (r->rid == RDT_RESOURCE_L3)
+               l3_qos_cfg_update(&hw_res->cdp_enabled);
  }
  
-static int rdtgroup_cpus_show(struct kernfs_open_file *of,
-                             struct seq_file *s, void *v)
+static int cdp_enable(int level)
  {
-       struct rdtgroup *rdtgrp;
-       struct cpumask *mask;
-       int ret = 0;
+       struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
+       int ret;
  
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!r_l->alloc_capable)
+               return -EINVAL;
  
-       if (rdtgrp) {
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
-                       if (!rdtgrp->plr->d) {
-                               rdt_last_cmd_clear();
-                               rdt_last_cmd_puts("Cache domain offline\n");
-                               ret = -ENODEV;
-                       } else {
-                               mask = &rdtgrp->plr->d->hdr.cpu_mask;
-                               seq_printf(s, is_cpu_list(of) ?
-                                          "%*pbl\n" : "%*pb\n",
-                                          cpumask_pr_args(mask));
-                       }
-               } else {
-                       seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
-                                  cpumask_pr_args(&rdtgrp->cpu_mask));
-               }
-       } else {
-               ret = -ENOENT;
-       }
-       rdtgroup_kn_unlock(of->kn);
+       ret = set_cache_qos_cfg(level, true);
+       if (!ret)
+               rdt_resources_all[level].cdp_enabled = true;
  
         return ret;
  }
  
-/*
- * This is safe against resctrl_arch_sched_in() called from __switch_to()
- * because __switch_to() is executed with interrupts disabled. A local call
- * from update_closid_rmid() is protected against __switch_to() because
- * preemption is disabled.
- */
-void resctrl_arch_sync_cpu_closid_rmid(void *info)
-{
-       struct resctrl_cpu_defaults *r = info;
-
-       if (r) {
-               this_cpu_write(pqr_state.default_closid, r->closid);
-               this_cpu_write(pqr_state.default_rmid, r->rmid);
-       }
-
-       /*
-        * We cannot unconditionally write the MSR because the current
-        * executing task might have its own closid selected. Just reuse
-        * the context switch code.
-        */
-       resctrl_arch_sched_in(current);
-}
-
-/*
- * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
- *
- * Per task closids/rmids must have been set up before calling this function.
- * @r may be NULL.
- */
-static void
-update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+static void cdp_disable(int level)
  {
-       struct resctrl_cpu_defaults defaults, *p = NULL;
+       struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
  
-       if (r) {
-               defaults.closid = r->closid;
-               defaults.rmid = r->mon.rmid;
-               p = &defaults;
+       if (r_hw->cdp_enabled) {
+               set_cache_qos_cfg(level, false);
+               r_hw->cdp_enabled = false;
         }
-
-       on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
  }
  
-static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
-                         cpumask_var_t tmpmask)
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
  {
-       struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
-       struct list_head *head;
+       struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
  
-       /* Check whether cpus belong to parent ctrl group */
-       cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
-       if (!cpumask_empty(tmpmask)) {
-               rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
+       if (!hw_res->r_resctrl.cdp_capable)
                 return -EINVAL;
-       }
-
-       /* Check whether cpus are dropped from this group */
-       cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
-       if (!cpumask_empty(tmpmask)) {
-               /* Give any dropped cpus to parent rdtgroup */
-               cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
-               update_closid_rmid(tmpmask, prgrp);
-       }
  
-       /*
-        * If we added cpus, remove them from previous group that owned them
-        * and update per-cpu rmid
-        */
-       cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
-       if (!cpumask_empty(tmpmask)) {
-               head = &prgrp->mon.crdtgrp_list;
-               list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
-                       if (crgrp == rdtgrp)
-                               continue;
-                       cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
-                                      tmpmask);
-               }
-               update_closid_rmid(tmpmask, rdtgrp);
-       }
+       if (enable)
+               return cdp_enable(l);
  
-       /* Done pushing/pulling - update this group with new mask */
-       cpumask_copy(&rdtgrp->cpu_mask, newmask);
+       cdp_disable(l);
  
         return 0;
  }
  
-static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
  {
-       struct rdtgroup *crgrp;
-
-       cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
-       /* update the child mon group masks as well*/
-       list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
-               cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+       return rdt_resources_all[l].cdp_enabled;
  }
  
-static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
-                          cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
  {
-       struct rdtgroup *r, *crgrp;
-       struct list_head *head;
+       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+       struct rdt_hw_ctrl_domain *hw_dom;
+       struct msr_param msr_param;
+       struct rdt_ctrl_domain *d;
+       int i;
  
-       /* Check whether cpus are dropped from this group */
-       cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
-       if (!cpumask_empty(tmpmask)) {
-               /* Can't drop from default group */
-               if (rdtgrp == &rdtgroup_default) {
-                       rdt_last_cmd_puts("Can't drop CPUs from default group\n");
-                       return -EINVAL;
-               }
+       /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
  
-               /* Give any dropped cpus to rdtgroup_default */
-               cpumask_or(&rdtgroup_default.cpu_mask,
-                          &rdtgroup_default.cpu_mask, tmpmask);
-               update_closid_rmid(tmpmask, &rdtgroup_default);
-       }
+       msr_param.res = r;
+       msr_param.low = 0;
+       msr_param.high = hw_res->num_closid;
  
         /*
-        * If we added cpus, remove them from previous group and
-        * the prev group's child groups that owned them
-        * and update per-cpu closid/rmid.
+        * Disable resource control for this resource by setting all
+        * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
+        * from each domain to update the MSRs below.
          */
-       cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
-       if (!cpumask_empty(tmpmask)) {
-               list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
-                       if (r == rdtgrp)
-                               continue;
-                       cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
-                       if (!cpumask_empty(tmpmask1))
-                               cpumask_rdtgrp_clear(r, tmpmask1);
-               }
-               update_closid_rmid(tmpmask, rdtgrp);
-       }
+       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+               hw_dom = resctrl_to_arch_ctrl_dom(d);
  
-       /* Done pushing/pulling - update this group with new mask */
-       cpumask_copy(&rdtgrp->cpu_mask, newmask);
+               for (i = 0; i < hw_res->num_closid; i++)
+                       hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+               msr_param.dom = d;
+               smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+       }
  
-       /*
-        * Clear child mon group masks since there is a new parent mask
-        * now and update the rmid for the cpus the child lost.
-        */
-       head = &rdtgrp->mon.crdtgrp_list;
-       list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
-               cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
-               update_closid_rmid(tmpmask, rdtgrp);
-               cpumask_clear(&crgrp->cpu_mask);
-       }
-
-       return 0;
-}
-
-static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
-                                  char *buf, size_t nbytes, loff_t off)
-{
-       cpumask_var_t tmpmask, newmask, tmpmask1;
-       struct rdtgroup *rdtgrp;
-       int ret;
-
-       if (!buf)
-               return -EINVAL;
-
-       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
-               return -ENOMEM;
-       if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
-               free_cpumask_var(tmpmask);
-               return -ENOMEM;
-       }
-       if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
-               free_cpumask_var(tmpmask);
-               free_cpumask_var(newmask);
-               return -ENOMEM;
-       }
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               ret = -ENOENT;
-               goto unlock;
-       }
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
-           rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-               ret = -EINVAL;
-               rdt_last_cmd_puts("Pseudo-locking in progress\n");
-               goto unlock;
-       }
-
-       if (is_cpu_list(of))
-               ret = cpulist_parse(buf, newmask);
-       else
-               ret = cpumask_parse(buf, newmask);
-
-       if (ret) {
-               rdt_last_cmd_puts("Bad CPU list/mask\n");
-               goto unlock;
-       }
-
-       /* check that user didn't specify any offline cpus */
-       cpumask_andnot(tmpmask, newmask, cpu_online_mask);
-       if (!cpumask_empty(tmpmask)) {
-               ret = -EINVAL;
-               rdt_last_cmd_puts("Can only assign online CPUs\n");
-               goto unlock;
-       }
-
-       if (rdtgrp->type == RDTCTRL_GROUP)
-               ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
-       else if (rdtgrp->type == RDTMON_GROUP)
-               ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
-       else
-               ret = -EINVAL;
-
-unlock:
-       rdtgroup_kn_unlock(of->kn);
-       free_cpumask_var(tmpmask);
-       free_cpumask_var(newmask);
-       free_cpumask_var(tmpmask1);
-
-       return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_remove - the helper to remove resource group safely
- * @rdtgrp: resource group to remove
- *
- * On resource group creation via a mkdir, an extra kernfs_node reference is
- * taken to ensure that the rdtgroup structure remains accessible for the
- * rdtgroup_kn_unlock() calls where it is removed.
- *
- * Drop the extra reference here, then free the rdtgroup structure.
- *
- * Return: void
- */
-static void rdtgroup_remove(struct rdtgroup *rdtgrp)
-{
-       kernfs_put(rdtgrp->kn);
-       kfree(rdtgrp);
-}
-
-static void _update_task_closid_rmid(void *task)
-{
-       /*
-        * If the task is still current on this CPU, update PQR_ASSOC MSR.
-        * Otherwise, the MSR is updated when the task is scheduled in.
-        */
-       if (task == current)
-               resctrl_arch_sched_in(task);
-}
-
-static void update_task_closid_rmid(struct task_struct *t)
-{
-       if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
-               smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
-       else
-               _update_task_closid_rmid(t);
-}
-
-static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
-{
-       u32 closid, rmid = rdtgrp->mon.rmid;
-
-       if (rdtgrp->type == RDTCTRL_GROUP)
-               closid = rdtgrp->closid;
-       else if (rdtgrp->type == RDTMON_GROUP)
-               closid = rdtgrp->mon.parent->closid;
-       else
-               return false;
-
-       return resctrl_arch_match_closid(tsk, closid) &&
-              resctrl_arch_match_rmid(tsk, closid, rmid);
-}
-
-static int __rdtgroup_move_task(struct task_struct *tsk,
-                               struct rdtgroup *rdtgrp)
-{
-       /* If the task is already in rdtgrp, no need to move the task. */
-       if (task_in_rdtgroup(tsk, rdtgrp))
-               return 0;
-
-       /*
-        * Set the task's closid/rmid before the PQR_ASSOC MSR can be
-        * updated by them.
-        *
-        * For ctrl_mon groups, move both closid and rmid.
-        * For monitor groups, can move the tasks only from
-        * their parent CTRL group.
-        */
-       if (rdtgrp->type == RDTMON_GROUP &&
-           !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
-               rdt_last_cmd_puts("Can't move task to different control group\n");
-               return -EINVAL;
-       }
-
-       if (rdtgrp->type == RDTMON_GROUP)
-               resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
-                                            rdtgrp->mon.rmid);
-       else
-               resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
-                                            rdtgrp->mon.rmid);
-
-       /*
-        * Ensure the task's closid and rmid are written before determining if
-        * the task is current that will decide if it will be interrupted.
-        * This pairs with the full barrier between the rq->curr update and
-        * resctrl_arch_sched_in() during context switch.
-        */
-       smp_mb();
-
-       /*
-        * By now, the task's closid and rmid are set. If the task is current
-        * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
-        * group go into effect. If the task is not current, the MSR will be
-        * updated when the task is scheduled in.
-        */
-       update_task_closid_rmid(tsk);
-
-       return 0;
-}
-
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
-       return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
-               resctrl_arch_match_closid(t, r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
-       return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
-               resctrl_arch_match_rmid(t, r->mon.parent->closid,
-                                       r->mon.rmid));
-}
-
-/**
- * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
- * @r: Resource group
- *
- * Return: 1 if tasks have been assigned to @r, 0 otherwise
- */
-int rdtgroup_tasks_assigned(struct rdtgroup *r)
-{
-       struct task_struct *p, *t;
-       int ret = 0;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       rcu_read_lock();
-       for_each_process_thread(p, t) {
-               if (is_closid_match(t, r) || is_rmid_match(t, r)) {
-                       ret = 1;
-                       break;
-               }
-       }
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static int rdtgroup_task_write_permission(struct task_struct *task,
-                                         struct kernfs_open_file *of)
-{
-       const struct cred *tcred = get_task_cred(task);
-       const struct cred *cred = current_cred();
-       int ret = 0;
-
-       /*
-        * Even if we're attaching all tasks in the thread group, we only
-        * need to check permissions on one of them.
-        */
-       if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-           !uid_eq(cred->euid, tcred->uid) &&
-           !uid_eq(cred->euid, tcred->suid)) {
-               rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
-               ret = -EPERM;
-       }
-
-       put_cred(tcred);
-       return ret;
-}
-
-static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
-                             struct kernfs_open_file *of)
-{
-       struct task_struct *tsk;
-       int ret;
-
-       rcu_read_lock();
-       if (pid) {
-               tsk = find_task_by_vpid(pid);
-               if (!tsk) {
-                       rcu_read_unlock();
-                       rdt_last_cmd_printf("No task %d\n", pid);
-                       return -ESRCH;
-               }
-       } else {
-               tsk = current;
-       }
-
-       get_task_struct(tsk);
-       rcu_read_unlock();
-
-       ret = rdtgroup_task_write_permission(tsk, of);
-       if (!ret)
-               ret = __rdtgroup_move_task(tsk, rdtgrp);
-
-       put_task_struct(tsk);
-       return ret;
-}
-
-static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
-                                   char *buf, size_t nbytes, loff_t off)
-{
-       struct rdtgroup *rdtgrp;
-       char *pid_str;
-       int ret = 0;
-       pid_t pid;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-       rdt_last_cmd_clear();
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
-           rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-               ret = -EINVAL;
-               rdt_last_cmd_puts("Pseudo-locking in progress\n");
-               goto unlock;
-       }
-
-       while (buf && buf[0] != '\0' && buf[0] != '\n') {
-               pid_str = strim(strsep(&buf, ","));
-
-               if (kstrtoint(pid_str, 0, &pid)) {
-                       rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
-                       ret = -EINVAL;
-                       break;
-               }
-
-               if (pid < 0) {
-                       rdt_last_cmd_printf("Invalid pid %d\n", pid);
-                       ret = -EINVAL;
-                       break;
-               }
-
-               ret = rdtgroup_move_task(pid, rdtgrp, of);
-               if (ret) {
-                       rdt_last_cmd_printf("Error while processing task %d\n", pid);
-                       break;
-               }
-       }
-
-unlock:
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret ?: nbytes;
-}
-
-static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
-{
-       struct task_struct *p, *t;
-       pid_t pid;
-
-       rcu_read_lock();
-       for_each_process_thread(p, t) {
-               if (is_closid_match(t, r) || is_rmid_match(t, r)) {
-                       pid = task_pid_vnr(t);
-                       if (pid)
-                               seq_printf(s, "%d\n", pid);
-               }
-       }
-       rcu_read_unlock();
-}
-
-static int rdtgroup_tasks_show(struct kernfs_open_file *of,
-                              struct seq_file *s, void *v)
-{
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (rdtgrp)
-               show_rdt_tasks(rdtgrp, s);
-       else
-               ret = -ENOENT;
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret;
-}
-
-static int rdtgroup_closid_show(struct kernfs_open_file *of,
-                               struct seq_file *s, void *v)
-{
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (rdtgrp)
-               seq_printf(s, "%u\n", rdtgrp->closid);
-       else
-               ret = -ENOENT;
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret;
-}
-
-static int rdtgroup_rmid_show(struct kernfs_open_file *of,
-                             struct seq_file *s, void *v)
-{
-       struct rdtgroup *rdtgrp;
-       int ret = 0;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (rdtgrp)
-               seq_printf(s, "%u\n", rdtgrp->mon.rmid);
-       else
-               ret = -ENOENT;
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret;
-}
-
-#ifdef CONFIG_PROC_CPU_RESCTRL
-
-/*
- * A task can only be part of one resctrl control group and of one monitor
- * group which is associated to that control group.
- *
- * 1)   res:
- *      mon:
- *
- *    resctrl is not available.
- *
- * 2)   res:/
- *      mon:
- *
- *    Task is part of the root resctrl control group, and it is not associated
- *    to any monitor group.
- *
- * 3)  res:/
- *     mon:mon0
- *
- *    Task is part of the root resctrl control group and monitor group mon0.
- *
- * 4)  res:group0
- *     mon:
- *
- *    Task is part of resctrl control group group0, and it is not associated
- *    to any monitor group.
- *
- * 5) res:group0
- *    mon:mon1
- *
- *    Task is part of resctrl control group group0 and monitor group mon1.
- */
-int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
-                     struct pid *pid, struct task_struct *tsk)
-{
-       struct rdtgroup *rdtg;
-       int ret = 0;
-
-       mutex_lock(&rdtgroup_mutex);
-
-       /* Return empty if resctrl has not been mounted. */
-       if (!resctrl_mounted) {
-               seq_puts(s, "res:\nmon:\n");
-               goto unlock;
-       }
-
-       list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
-               struct rdtgroup *crg;
-
-               /*
-                * Task information is only relevant for shareable
-                * and exclusive groups.
-                */
-               if (rdtg->mode != RDT_MODE_SHAREABLE &&
-                   rdtg->mode != RDT_MODE_EXCLUSIVE)
-                       continue;
-
-               if (!resctrl_arch_match_closid(tsk, rdtg->closid))
-                       continue;
-
-               seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
-                          rdt_kn_name(rdtg->kn));
-               seq_puts(s, "mon:");
-               list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
-                                   mon.crdtgrp_list) {
-                       if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
-                                                    crg->mon.rmid))
-                               continue;
-                       seq_printf(s, "%s", rdt_kn_name(crg->kn));
-                       break;
-               }
-               seq_putc(s, '\n');
-               goto unlock;
-       }
-       /*
-        * The above search should succeed. Otherwise return
-        * with an error.
-        */
-       ret = -ENOENT;
-unlock:
-       mutex_unlock(&rdtgroup_mutex);
-
-       return ret;
-}
-#endif
-
-static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
-                                   struct seq_file *seq, void *v)
-{
-       int len;
-
-       mutex_lock(&rdtgroup_mutex);
-       len = seq_buf_used(&last_cmd_status);
-       if (len)
-               seq_printf(seq, "%.*s", len, last_cmd_status_buf);
-       else
-               seq_puts(seq, "ok\n");
-       mutex_unlock(&rdtgroup_mutex);
-       return 0;
-}
-
-static void *rdt_kn_parent_priv(struct kernfs_node *kn)
-{
-       /*
-        * The parent pointer is only valid within RCU section since it can be
-        * replaced.
-        */
-       guard(rcu)();
-       return rcu_dereference(kn->__parent)->priv;
-}
-
-static int rdt_num_closids_show(struct kernfs_open_file *of,
-                               struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-
-       seq_printf(seq, "%u\n", s->num_closid);
-       return 0;
-}
-
-static int rdt_default_ctrl_show(struct kernfs_open_file *of,
-                                struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
-       return 0;
-}
-
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
-                                struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
-       return 0;
-}
-
-static int rdt_shareable_bits_show(struct kernfs_open_file *of,
-                                  struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%x\n", r->cache.shareable_bits);
-       return 0;
-}
-
-/*
- * rdt_bit_usage_show - Display current usage of resources
- *
- * A domain is a shared resource that can now be allocated differently. Here
- * we display the current regions of the domain as an annotated bitmask.
- * For each domain of this resource its allocation bitmask
- * is annotated as below to indicate the current usage of the corresponding bit:
- *   0 - currently unused
- *   X - currently available for sharing and used by software and hardware
- *   H - currently used by hardware only but available for software use
- *   S - currently used and shareable by software only
- *   E - currently used exclusively by one resource group
- *   P - currently pseudo-locked by one resource group
- */
-static int rdt_bit_usage_show(struct kernfs_open_file *of,
-                             struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       /*
-        * Use unsigned long even though only 32 bits are used to ensure
-        * test_bit() is used safely.
-        */
-       unsigned long sw_shareable = 0, hw_shareable = 0;
-       unsigned long exclusive = 0, pseudo_locked = 0;
-       struct rdt_resource *r = s->res;
-       struct rdt_ctrl_domain *dom;
-       int i, hwb, swb, excl, psl;
-       enum rdtgrp_mode mode;
-       bool sep = false;
-       u32 ctrl_val;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-       hw_shareable = r->cache.shareable_bits;
-       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
-               if (sep)
-                       seq_putc(seq, ';');
-               sw_shareable = 0;
-               exclusive = 0;
-               seq_printf(seq, "%d=", dom->hdr.id);
-               for (i = 0; i < closids_supported(); i++) {
-                       if (!closid_allocated(i))
-                               continue;
-                       ctrl_val = resctrl_arch_get_config(r, dom, i,
-                                                          s->conf_type);
-                       mode = rdtgroup_mode_by_closid(i);
-                       switch (mode) {
-                       case RDT_MODE_SHAREABLE:
-                               sw_shareable |= ctrl_val;
-                               break;
-                       case RDT_MODE_EXCLUSIVE:
-                               exclusive |= ctrl_val;
-                               break;
-                       case RDT_MODE_PSEUDO_LOCKSETUP:
-                       /*
-                        * RDT_MODE_PSEUDO_LOCKSETUP is possible
-                        * here but not included since the CBM
-                        * associated with this CLOSID in this mode
-                        * is not initialized and no task or cpu can be
-                        * assigned this CLOSID.
-                        */
-                               break;
-                       case RDT_MODE_PSEUDO_LOCKED:
-                       case RDT_NUM_MODES:
-                               WARN(1,
-                                    "invalid mode for closid %d\n", i);
-                               break;
-                       }
-               }
-               for (i = r->cache.cbm_len - 1; i >= 0; i--) {
-                       pseudo_locked = dom->plr ? dom->plr->cbm : 0;
-                       hwb = test_bit(i, &hw_shareable);
-                       swb = test_bit(i, &sw_shareable);
-                       excl = test_bit(i, &exclusive);
-                       psl = test_bit(i, &pseudo_locked);
-                       if (hwb && swb)
-                               seq_putc(seq, 'X');
-                       else if (hwb && !swb)
-                               seq_putc(seq, 'H');
-                       else if (!hwb && swb)
-                               seq_putc(seq, 'S');
-                       else if (excl)
-                               seq_putc(seq, 'E');
-                       else if (psl)
-                               seq_putc(seq, 'P');
-                       else /* Unused bits remain */
-                               seq_putc(seq, '0');
-               }
-               sep = true;
-       }
-       seq_putc(seq, '\n');
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-       return 0;
-}
-
-static int rdt_min_bw_show(struct kernfs_open_file *of,
-                          struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%u\n", r->membw.min_bw);
-       return 0;
-}
-
-static int rdt_num_rmids_show(struct kernfs_open_file *of,
-                             struct seq_file *seq, void *v)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
-       seq_printf(seq, "%d\n", r->num_rmid);
-
-       return 0;
-}
-
-static int rdt_mon_features_show(struct kernfs_open_file *of,
-                                struct seq_file *seq, void *v)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-       struct mon_evt *mevt;
-
-       list_for_each_entry(mevt, &r->evt_list, list) {
-               seq_printf(seq, "%s\n", mevt->name);
-               if (mevt->configurable)
-                       seq_printf(seq, "%s_config\n", mevt->name);
-       }
-
-       return 0;
-}
-
-static int rdt_bw_gran_show(struct kernfs_open_file *of,
-                           struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%u\n", r->membw.bw_gran);
-       return 0;
-}
-
-static int rdt_delay_linear_show(struct kernfs_open_file *of,
-                                struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%u\n", r->membw.delay_linear);
-       return 0;
-}
-
-static int max_threshold_occ_show(struct kernfs_open_file *of,
-                                 struct seq_file *seq, void *v)
-{
-       seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
-
-       return 0;
-}
-
-static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
-                                        struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       switch (r->membw.throttle_mode) {
-       case THREAD_THROTTLE_PER_THREAD:
-               seq_puts(seq, "per-thread\n");
-               return 0;
-       case THREAD_THROTTLE_MAX:
-               seq_puts(seq, "max\n");
-               return 0;
-       case THREAD_THROTTLE_UNDEFINED:
-               seq_puts(seq, "undefined\n");
-               return 0;
-       }
-
-       WARN_ON_ONCE(1);
-
-       return 0;
-}
-
-static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
-                                      char *buf, size_t nbytes, loff_t off)
-{
-       unsigned int bytes;
-       int ret;
-
-       ret = kstrtouint(buf, 0, &bytes);
-       if (ret)
-               return ret;
-
-       if (bytes > resctrl_rmid_realloc_limit)
-               return -EINVAL;
-
-       resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
-
-       return nbytes;
-}
-
-/*
- * rdtgroup_mode_show - Display mode of this resource group
- */
-static int rdtgroup_mode_show(struct kernfs_open_file *of,
-                             struct seq_file *s, void *v)
-{
-       struct rdtgroup *rdtgrp;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-
-       seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
-
-       rdtgroup_kn_unlock(of->kn);
-       return 0;
-}
-
-static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
-{
-       switch (my_type) {
-       case CDP_CODE:
-               return CDP_DATA;
-       case CDP_DATA:
-               return CDP_CODE;
-       default:
-       case CDP_NONE:
-               return CDP_NONE;
-       }
-}
-
-static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
-                                       struct seq_file *seq, void *v)
-{
-       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
-       struct rdt_resource *r = s->res;
-
-       seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
-
-       return 0;
-}
-
-/**
- * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
- * @r: Resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @type: CDP type of @r.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Checks if provided @cbm intended to be used for @closid on domain
- * @d overlaps with any other closids or other hardware usage associated
- * with this domain. If @exclusive is true then only overlaps with
- * resource groups in exclusive mode will be considered. If @exclusive
- * is false then overlaps with any resource group or hardware entities
- * will be considered.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: false if CBM does not overlap, true if it does.
- */
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
-                                   unsigned long cbm, int closid,
-                                   enum resctrl_conf_type type, bool exclusive)
-{
-       enum rdtgrp_mode mode;
-       unsigned long ctrl_b;
-       int i;
-
-       /* Check for any overlap with regions used by hardware directly */
-       if (!exclusive) {
-               ctrl_b = r->cache.shareable_bits;
-               if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
-                       return true;
-       }
-
-       /* Check for overlap with other resource groups */
-       for (i = 0; i < closids_supported(); i++) {
-               ctrl_b = resctrl_arch_get_config(r, d, i, type);
-               mode = rdtgroup_mode_by_closid(i);
-               if (closid_allocated(i) && i != closid &&
-                   mode != RDT_MODE_PSEUDO_LOCKSETUP) {
-                       if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
-                               if (exclusive) {
-                                       if (mode == RDT_MODE_EXCLUSIVE)
-                                               return true;
-                                       continue;
-                               }
-                               return true;
-                       }
-               }
-       }
-
-       return false;
-}
-
-/**
- * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
- * @s: Schema for the resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Resources that can be allocated using a CBM can use the CBM to control
- * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
- * for overlap. Overlap test is not limited to the specific resource for
- * which the CBM is intended though - when dealing with CDP resources that
- * share the underlying hardware the overlap check should be performed on
- * the CDP resource sharing the hardware also.
- *
- * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
- * overlap test.
- *
- * Return: true if CBM overlap detected, false if there is no overlap
- */
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
-                          unsigned long cbm, int closid, bool exclusive)
-{
-       enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
-       struct rdt_resource *r = s->res;
-
-       if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
-                                   exclusive))
-               return true;
-
-       if (!resctrl_arch_get_cdp_enabled(r->rid))
-               return false;
-       return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
-}
-
-/**
- * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
- * @rdtgrp: Resource group identified through its closid.
- *
- * An exclusive resource group implies that there should be no sharing of
- * its allocated resources. At the time this group is considered to be
- * exclusive this test can determine if its current schemata supports this
- * setting by testing for overlap with all other resource groups.
- *
- * Return: true if resource group can be exclusive, false if there is overlap
- * with allocations of other resource groups and thus this resource group
- * cannot be exclusive.
- */
-static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
-{
-       int closid = rdtgrp->closid;
-       struct rdt_ctrl_domain *d;
-       struct resctrl_schema *s;
-       struct rdt_resource *r;
-       bool has_cache = false;
-       u32 ctrl;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       list_for_each_entry(s, &resctrl_schema_all, list) {
-               r = s->res;
-               if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
-                       continue;
-               has_cache = true;
-               list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-                       ctrl = resctrl_arch_get_config(r, d, closid,
-                                                      s->conf_type);
-                       if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
-                               rdt_last_cmd_puts("Schemata overlaps\n");
-                               return false;
-                       }
-               }
-       }
-
-       if (!has_cache) {
-               rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
-               return false;
-       }
-
-       return true;
-}
-
-/*
- * rdtgroup_mode_write - Modify the resource group's mode
- */
-static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
-                                  char *buf, size_t nbytes, loff_t off)
-{
-       struct rdtgroup *rdtgrp;
-       enum rdtgrp_mode mode;
-       int ret = 0;
-
-       /* Valid input requires a trailing newline */
-       if (nbytes == 0 || buf[nbytes - 1] != '\n')
-               return -EINVAL;
-       buf[nbytes - 1] = '\0';
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-
-       rdt_last_cmd_clear();
-
-       mode = rdtgrp->mode;
-
-       if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
-           (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
-           (!strcmp(buf, "pseudo-locksetup") &&
-            mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
-           (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
-               goto out;
-
-       if (mode == RDT_MODE_PSEUDO_LOCKED) {
-               rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
-               ret = -EINVAL;
-               goto out;
-       }
-
-       if (!strcmp(buf, "shareable")) {
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-                       ret = rdtgroup_locksetup_exit(rdtgrp);
-                       if (ret)
-                               goto out;
-               }
-               rdtgrp->mode = RDT_MODE_SHAREABLE;
-       } else if (!strcmp(buf, "exclusive")) {
-               if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-                       ret = rdtgroup_locksetup_exit(rdtgrp);
-                       if (ret)
-                               goto out;
-               }
-               rdtgrp->mode = RDT_MODE_EXCLUSIVE;
-       } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
-                  !strcmp(buf, "pseudo-locksetup")) {
-               ret = rdtgroup_locksetup_enter(rdtgrp);
-               if (ret)
-                       goto out;
-               rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
-       } else {
-               rdt_last_cmd_puts("Unknown or unsupported mode\n");
-               ret = -EINVAL;
-       }
-
-out:
-       rdtgroup_kn_unlock(of->kn);
-       return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_cbm_to_size - Translate CBM to size in bytes
- * @r: RDT resource to which @d belongs.
- * @d: RDT domain instance.
- * @cbm: bitmask for which the size should be computed.
- *
- * The bitmask provided associated with the RDT domain instance @d will be
- * translated into how many bytes it represents. The size in bytes is
- * computed by first dividing the total cache size by the CBM length to
- * determine how many bytes each bit in the bitmask represents. The result
- * is multiplied with the number of bits set in the bitmask.
- *
- * @cbm is unsigned long, even if only 32 bits are used to make the
- * bitmap functions work correctly.
- */
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
-                                 struct rdt_ctrl_domain *d, unsigned long cbm)
-{
-       unsigned int size = 0;
-       struct cacheinfo *ci;
-       int num_b;
-
-       if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
-               return size;
-
-       num_b = bitmap_weight(&cbm, r->cache.cbm_len);
-       ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
-       if (ci)
-               size = ci->size / r->cache.cbm_len * num_b;
-
-       return size;
-}
-
-bool is_mba_sc(struct rdt_resource *r)
-{
-       if (!r)
-               r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-
-       /*
-        * The software controller support is only applicable to MBA resource.
-        * Make sure to check for resource type.
-        */
-       if (r->rid != RDT_RESOURCE_MBA)
-               return false;
-
-       return r->membw.mba_sc;
-}
-
-/*
- * rdtgroup_size_show - Display size in bytes of allocated regions
- *
- * The "size" file mirrors the layout of the "schemata" file, printing the
- * size in bytes of each region instead of the capacity bitmask.
- */
-static int rdtgroup_size_show(struct kernfs_open_file *of,
-                             struct seq_file *s, void *v)
-{
-       struct resctrl_schema *schema;
-       enum resctrl_conf_type type;
-       struct rdt_ctrl_domain *d;
-       struct rdtgroup *rdtgrp;
-       struct rdt_resource *r;
-       unsigned int size;
-       int ret = 0;
-       u32 closid;
-       bool sep;
-       u32 ctrl;
-
-       rdtgrp = rdtgroup_kn_lock_live(of->kn);
-       if (!rdtgrp) {
-               rdtgroup_kn_unlock(of->kn);
-               return -ENOENT;
-       }
-
-       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
-               if (!rdtgrp->plr->d) {
-                       rdt_last_cmd_clear();
-                       rdt_last_cmd_puts("Cache domain offline\n");
-                       ret = -ENODEV;
-               } else {
-                       seq_printf(s, "%*s:", max_name_width,
-                                  rdtgrp->plr->s->name);
-                       size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
-                                                   rdtgrp->plr->d,
-                                                   rdtgrp->plr->cbm);
-                       seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
-               }
-               goto out;
-       }
-
-       closid = rdtgrp->closid;
-
-       list_for_each_entry(schema, &resctrl_schema_all, list) {
-               r = schema->res;
-               type = schema->conf_type;
-               sep = false;
-               seq_printf(s, "%*s:", max_name_width, schema->name);
-               list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-                       if (sep)
-                               seq_putc(s, ';');
-                       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
-                               size = 0;
-                       } else {
-                               if (is_mba_sc(r))
-                                       ctrl = d->mbps_val[closid];
-                               else
-                                       ctrl = resctrl_arch_get_config(r, d,
-                                                                      closid,
-                                                                      type);
-                               if (r->rid == RDT_RESOURCE_MBA ||
-                                   r->rid == RDT_RESOURCE_SMBA)
-                                       size = ctrl;
-                               else
-                                       size = rdtgroup_cbm_to_size(r, d, ctrl);
-                       }
-                       seq_printf(s, "%d=%u", d->hdr.id, size);
-                       sep = true;
-               }
-               seq_putc(s, '\n');
-       }
-
-out:
-       rdtgroup_kn_unlock(of->kn);
-
-       return ret;
-}
-
-#define INVALID_CONFIG_INDEX   UINT_MAX
-
-/**
- * mon_event_config_index_get - get the hardware index for the
- *                              configurable event
- * @evtid: event id.
- *
- * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
- *         1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
- *         INVALID_CONFIG_INDEX for invalid evtid
- */
-static inline unsigned int mon_event_config_index_get(u32 evtid)
-{
-       switch (evtid) {
-       case QOS_L3_MBM_TOTAL_EVENT_ID:
-               return 0;
-       case QOS_L3_MBM_LOCAL_EVENT_ID:
-               return 1;
-       default:
-               /* Should never reach here */
-               return INVALID_CONFIG_INDEX;
-       }
-}
-
-void resctrl_arch_mon_event_config_read(void *_config_info)
-{
-       struct resctrl_mon_config_info *config_info = _config_info;
-       unsigned int index;
-       u64 msrval;
-
-       index = mon_event_config_index_get(config_info->evtid);
-       if (index == INVALID_CONFIG_INDEX) {
-               pr_warn_once("Invalid event id %d\n", config_info->evtid);
-               return;
-       }
-       rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
-
-       /* Report only the valid event configuration bits */
-       config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
-}
-
-static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
-{
-       smp_call_function_any(&mon_info->d->hdr.cpu_mask,
-                             resctrl_arch_mon_event_config_read, mon_info, 1);
-}
-
-static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
-{
-       struct resctrl_mon_config_info mon_info;
-       struct rdt_mon_domain *dom;
-       bool sep = false;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       list_for_each_entry(dom, &r->mon_domains, hdr.list) {
-               if (sep)
-                       seq_puts(s, ";");
-
-               memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
-               mon_info.r = r;
-               mon_info.d = dom;
-               mon_info.evtid = evtid;
-               mondata_config_read(&mon_info);
-
-               seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
-               sep = true;
-       }
-       seq_puts(s, "\n");
-
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-
-       return 0;
-}
-
-static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
-                                      struct seq_file *seq, void *v)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
-       mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
-
-       return 0;
-}
-
-static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
-                                      struct seq_file *seq, void *v)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-
-       mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
-
-       return 0;
-}
-
-void resctrl_arch_mon_event_config_write(void *_config_info)
-{
-       struct resctrl_mon_config_info *config_info = _config_info;
-       unsigned int index;
-
-       index = mon_event_config_index_get(config_info->evtid);
-       if (index == INVALID_CONFIG_INDEX) {
-               pr_warn_once("Invalid event id %d\n", config_info->evtid);
-               return;
-       }
-       wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
-}
-
-static void mbm_config_write_domain(struct rdt_resource *r,
-                                   struct rdt_mon_domain *d, u32 evtid, u32 val)
-{
-       struct resctrl_mon_config_info mon_info = {0};
-
-       /*
-        * Read the current config value first. If both are the same then
-        * no need to write it again.
-        */
-       mon_info.r = r;
-       mon_info.d = d;
-       mon_info.evtid = evtid;
-       mondata_config_read(&mon_info);
-       if (mon_info.mon_config == val)
-               return;
-
-       mon_info.mon_config = val;
-
-       /*
-        * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
-        * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
-        * are scoped at the domain level. Writing any of these MSRs
-        * on one CPU is observed by all the CPUs in the domain.
-        */
-       smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
-                             &mon_info, 1);
-
-       /*
-        * When an Event Configuration is changed, the bandwidth counters
-        * for all RMIDs and Events will be cleared by the hardware. The
-        * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
-        * every RMID on the next read to any event for every RMID.
-        * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
-        * cleared while it is tracked by the hardware. Clear the
-        * mbm_local and mbm_total counts for all the RMIDs.
-        */
-       resctrl_arch_reset_rmid_all(r, d);
-}
-
-static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
-{
-       char *dom_str = NULL, *id_str;
-       unsigned long dom_id, val;
-       struct rdt_mon_domain *d;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-next:
-       if (!tok || tok[0] == '\0')
-               return 0;
-
-       /* Start processing the strings for each domain */
-       dom_str = strim(strsep(&tok, ";"));
-       id_str = strsep(&dom_str, "=");
-
-       if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
-               rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
-               return -EINVAL;
-       }
-
-       if (!dom_str || kstrtoul(dom_str, 16, &val)) {
-               rdt_last_cmd_puts("Non-numeric event configuration value\n");
-               return -EINVAL;
-       }
-
-       /* Value from user cannot be more than the supported set of events */
-       if ((val & r->mbm_cfg_mask) != val) {
-               rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
-                                   r->mbm_cfg_mask);
-               return -EINVAL;
-       }
-
-       list_for_each_entry(d, &r->mon_domains, hdr.list) {
-               if (d->hdr.id == dom_id) {
-                       mbm_config_write_domain(r, d, evtid, val);
-                       goto next;
-               }
-       }
-
-       return -EINVAL;
-}
-
-static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
-                                           char *buf, size_t nbytes,
-                                           loff_t off)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-       int ret;
-
-       /* Valid input requires a trailing newline */
-       if (nbytes == 0 || buf[nbytes - 1] != '\n')
-               return -EINVAL;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       rdt_last_cmd_clear();
-
-       buf[nbytes - 1] = '\0';
-
-       ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
-
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-
-       return ret ?: nbytes;
-}
-
-static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
-                                           char *buf, size_t nbytes,
-                                           loff_t off)
-{
-       struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
-       int ret;
-
-       /* Valid input requires a trailing newline */
-       if (nbytes == 0 || buf[nbytes - 1] != '\n')
-               return -EINVAL;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       rdt_last_cmd_clear();
-
-       buf[nbytes - 1] = '\0';
-
-       ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
-
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-
-       return ret ?: nbytes;
-}
-
-/* rdtgroup information files for one cache resource. */
-static struct rftype res_common_files[] = {
-       {
-               .name           = "last_cmd_status",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_last_cmd_status_show,
-               .fflags         = RFTYPE_TOP_INFO,
-       },
-       {
-               .name           = "num_closids",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_num_closids_show,
-               .fflags         = RFTYPE_CTRL_INFO,
-       },
-       {
-               .name           = "mon_features",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_mon_features_show,
-               .fflags         = RFTYPE_MON_INFO,
-       },
-       {
-               .name           = "num_rmids",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_num_rmids_show,
-               .fflags         = RFTYPE_MON_INFO,
-       },
-       {
-               .name           = "cbm_mask",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_default_ctrl_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "min_cbm_bits",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_min_cbm_bits_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "shareable_bits",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_shareable_bits_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "bit_usage",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_bit_usage_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "min_bandwidth",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_min_bw_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
-       },
-       {
-               .name           = "bandwidth_gran",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_bw_gran_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
-       },
-       {
-               .name           = "delay_linear",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_delay_linear_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
-       },
-       /*
-        * Platform specific which (if any) capabilities are provided by
-        * thread_throttle_mode. Defer "fflags" initialization to platform
-        * discovery.
-        */
-       {
-               .name           = "thread_throttle_mode",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_thread_throttle_mode_show,
-       },
-       {
-               .name           = "max_threshold_occupancy",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = max_threshold_occ_write,
-               .seq_show       = max_threshold_occ_show,
-               .fflags         = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "mbm_total_bytes_config",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = mbm_total_bytes_config_show,
-               .write          = mbm_total_bytes_config_write,
-       },
-       {
-               .name           = "mbm_local_bytes_config",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = mbm_local_bytes_config_show,
-               .write          = mbm_local_bytes_config_write,
-       },
-       {
-               .name           = "cpus",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_cpus_write,
-               .seq_show       = rdtgroup_cpus_show,
-               .fflags         = RFTYPE_BASE,
-       },
-       {
-               .name           = "cpus_list",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_cpus_write,
-               .seq_show       = rdtgroup_cpus_show,
-               .flags          = RFTYPE_FLAGS_CPUS_LIST,
-               .fflags         = RFTYPE_BASE,
-       },
-       {
-               .name           = "tasks",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_tasks_write,
-               .seq_show       = rdtgroup_tasks_show,
-               .fflags         = RFTYPE_BASE,
-       },
-       {
-               .name           = "mon_hw_id",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdtgroup_rmid_show,
-               .fflags         = RFTYPE_MON_BASE | RFTYPE_DEBUG,
-       },
-       {
-               .name           = "schemata",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_schemata_write,
-               .seq_show       = rdtgroup_schemata_show,
-               .fflags         = RFTYPE_CTRL_BASE,
-       },
-       {
-               .name           = "mba_MBps_event",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_mba_mbps_event_write,
-               .seq_show       = rdtgroup_mba_mbps_event_show,
-       },
-       {
-               .name           = "mode",
-               .mode           = 0644,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .write          = rdtgroup_mode_write,
-               .seq_show       = rdtgroup_mode_show,
-               .fflags         = RFTYPE_CTRL_BASE,
-       },
-       {
-               .name           = "size",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdtgroup_size_show,
-               .fflags         = RFTYPE_CTRL_BASE,
-       },
-       {
-               .name           = "sparse_masks",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdt_has_sparse_bitmasks_show,
-               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
-       },
-       {
-               .name           = "ctrl_hw_id",
-               .mode           = 0444,
-               .kf_ops         = &rdtgroup_kf_single_ops,
-               .seq_show       = rdtgroup_closid_show,
-               .fflags         = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
-       },
-};
-
-static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
-{
-       struct rftype *rfts, *rft;
-       int ret, len;
-
-       rfts = res_common_files;
-       len = ARRAY_SIZE(res_common_files);
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       if (resctrl_debug)
-               fflags |= RFTYPE_DEBUG;
-
-       for (rft = rfts; rft < rfts + len; rft++) {
-               if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
-                       ret = rdtgroup_add_file(kn, rft);
-                       if (ret)
-                               goto error;
-               }
-       }
-
-       return 0;
-error:
-       pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
-       while (--rft >= rfts) {
-               if ((fflags & rft->fflags) == rft->fflags)
-                       kernfs_remove_by_name(kn, rft->name);
-       }
-       return ret;
-}
-
-static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
-{
-       struct rftype *rfts, *rft;
-       int len;
-
-       rfts = res_common_files;
-       len = ARRAY_SIZE(res_common_files);
-
-       for (rft = rfts; rft < rfts + len; rft++) {
-               if (!strcmp(rft->name, name))
-                       return rft;
-       }
-
-       return NULL;
-}
-
-static void thread_throttle_mode_init(void)
-{
-       enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
-       struct rdt_resource *r_mba, *r_smba;
-
-       r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-       if (r_mba->alloc_capable &&
-           r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
-               throttle_mode = r_mba->membw.throttle_mode;
-
-       r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
-       if (r_smba->alloc_capable &&
-           r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
-               throttle_mode = r_smba->membw.throttle_mode;
-
-       if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
-               return;
-
-       resctrl_file_fflags_init("thread_throttle_mode",
-                                RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
-}
-
-void resctrl_file_fflags_init(const char *config, unsigned long fflags)
-{
-       struct rftype *rft;
-
-       rft = rdtgroup_get_rftype_by_name(config);
-       if (rft)
-               rft->fflags = fflags;
-}
-
-/**
- * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- *
- * The permissions of named resctrl file, directory, or link are modified
- * to not allow read, write, or execute by any user.
- *
- * WARNING: This function is intended to communicate to the user that the
- * resctrl file has been locked down - that it is not relevant to the
- * particular state the system finds itself in. It should not be relied
- * on to protect from user access because after the file's permissions
- * are restricted the user can still change the permissions using chmod
- * from the command line.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
-{
-       struct iattr iattr = {.ia_valid = ATTR_MODE,};
-       struct kernfs_node *kn;
-       int ret = 0;
-
-       kn = kernfs_find_and_get_ns(r->kn, name, NULL);
-       if (!kn)
-               return -ENOENT;
-
-       switch (kernfs_type(kn)) {
-       case KERNFS_DIR:
-               iattr.ia_mode = S_IFDIR;
-               break;
-       case KERNFS_FILE:
-               iattr.ia_mode = S_IFREG;
-               break;
-       case KERNFS_LINK:
-               iattr.ia_mode = S_IFLNK;
-               break;
-       }
-
-       ret = kernfs_setattr(kn, &iattr);
-       kernfs_put(kn);
-       return ret;
-}
-
-/**
- * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- * @mask: Mask of permissions that should be restored
- *
- * Restore the permissions of the named file. If @name is a directory the
- * permissions of its parent will be used.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
-                            umode_t mask)
-{
-       struct iattr iattr = {.ia_valid = ATTR_MODE,};
-       struct kernfs_node *kn, *parent;
-       struct rftype *rfts, *rft;
-       int ret, len;
-
-       rfts = res_common_files;
-       len = ARRAY_SIZE(res_common_files);
-
-       for (rft = rfts; rft < rfts + len; rft++) {
-               if (!strcmp(rft->name, name))
-                       iattr.ia_mode = rft->mode & mask;
-       }
-
-       kn = kernfs_find_and_get_ns(r->kn, name, NULL);
-       if (!kn)
-               return -ENOENT;
-
-       switch (kernfs_type(kn)) {
-       case KERNFS_DIR:
-               parent = kernfs_get_parent(kn);
-               if (parent) {
-                       iattr.ia_mode |= parent->mode;
-                       kernfs_put(parent);
-               }
-               iattr.ia_mode |= S_IFDIR;
-               break;
-       case KERNFS_FILE:
-               iattr.ia_mode |= S_IFREG;
-               break;
-       case KERNFS_LINK:
-               iattr.ia_mode |= S_IFLNK;
-               break;
-       }
-
-       ret = kernfs_setattr(kn, &iattr);
-       kernfs_put(kn);
-       return ret;
-}
-
-static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
-                                     unsigned long fflags)
-{
-       struct kernfs_node *kn_subdir;
-       int ret;
-
-       kn_subdir = kernfs_create_dir(kn_info, name,
-                                     kn_info->mode, priv);
-       if (IS_ERR(kn_subdir))
-               return PTR_ERR(kn_subdir);
-
-       ret = rdtgroup_kn_set_ugid(kn_subdir);
-       if (ret)
-               return ret;
-
-       ret = rdtgroup_add_files(kn_subdir, fflags);
-       if (!ret)
-               kernfs_activate(kn_subdir);
-
-       return ret;
-}
-
-static unsigned long fflags_from_resource(struct rdt_resource *r)
-{
-       switch (r->rid) {
-       case RDT_RESOURCE_L3:
-       case RDT_RESOURCE_L2:
-               return RFTYPE_RES_CACHE;
-       case RDT_RESOURCE_MBA:
-       case RDT_RESOURCE_SMBA:
-               return RFTYPE_RES_MB;
-       }
-
-       return WARN_ON_ONCE(1);
-}
-
-static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
-{
-       struct resctrl_schema *s;
-       struct rdt_resource *r;
-       unsigned long fflags;
-       char name[32];
-       int ret;
-
-       /* create the directory */
-       kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
-       if (IS_ERR(kn_info))
-               return PTR_ERR(kn_info);
-
-       ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
-       if (ret)
-               goto out_destroy;
-
-       /* loop over enabled controls, these are all alloc_capable */
-       list_for_each_entry(s, &resctrl_schema_all, list) {
-               r = s->res;
-               fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
-               ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
-               if (ret)
-                       goto out_destroy;
-       }
-
-       for_each_mon_capable_rdt_resource(r) {
-               fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
-               sprintf(name, "%s_MON", r->name);
-               ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
-               if (ret)
-                       goto out_destroy;
-       }
-
-       ret = rdtgroup_kn_set_ugid(kn_info);
-       if (ret)
-               goto out_destroy;
-
-       kernfs_activate(kn_info);
-
-       return 0;
-
-out_destroy:
-       kernfs_remove(kn_info);
-       return ret;
-}
-
-static int
-mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
-                   char *name, struct kernfs_node **dest_kn)
-{
-       struct kernfs_node *kn;
-       int ret;
-
-       /* create the directory */
-       kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
-       if (IS_ERR(kn))
-               return PTR_ERR(kn);
-
-       if (dest_kn)
-               *dest_kn = kn;
-
-       ret = rdtgroup_kn_set_ugid(kn);
-       if (ret)
-               goto out_destroy;
-
-       kernfs_activate(kn);
-
-       return 0;
-
-out_destroy:
-       kernfs_remove(kn);
-       return ret;
-}
-
-static void l3_qos_cfg_update(void *arg)
-{
-       bool *enable = arg;
-
-       wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
-}
-
-static void l2_qos_cfg_update(void *arg)
-{
-       bool *enable = arg;
-
-       wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
-}
-
-static inline bool is_mba_linear(void)
-{
-       return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
-}
-
-static int set_cache_qos_cfg(int level, bool enable)
-{
-       void (*update)(void *arg);
-       struct rdt_ctrl_domain *d;
-       struct rdt_resource *r_l;
-       cpumask_var_t cpu_mask;
-       int cpu;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       if (level == RDT_RESOURCE_L3)
-               update = l3_qos_cfg_update;
-       else if (level == RDT_RESOURCE_L2)
-               update = l2_qos_cfg_update;
-       else
-               return -EINVAL;
-
-       if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
-               return -ENOMEM;
-
-       r_l = &rdt_resources_all[level].r_resctrl;
-       list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
-               if (r_l->cache.arch_has_per_cpu_cfg)
-                       /* Pick all the CPUs in the domain instance */
-                       for_each_cpu(cpu, &d->hdr.cpu_mask)
-                               cpumask_set_cpu(cpu, cpu_mask);
-               else
-                       /* Pick one CPU from each domain instance to update MSR */
-                       cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
-       }
-
-       /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
-       on_each_cpu_mask(cpu_mask, update, &enable, 1);
-
-       free_cpumask_var(cpu_mask);
-
-       return 0;
-}
-
-/* Restore the qos cfg state when a domain comes online */
-void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
-{
-       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-
-       if (!r->cdp_capable)
-               return;
-
-       if (r->rid == RDT_RESOURCE_L2)
-               l2_qos_cfg_update(&hw_res->cdp_enabled);
-
-       if (r->rid == RDT_RESOURCE_L3)
-               l3_qos_cfg_update(&hw_res->cdp_enabled);
-}
-
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
-       u32 num_closid = resctrl_arch_get_num_closid(r);
-       int cpu = cpumask_any(&d->hdr.cpu_mask);
-       int i;
-
-       d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
-                                  GFP_KERNEL, cpu_to_node(cpu));
-       if (!d->mbps_val)
-               return -ENOMEM;
-
-       for (i = 0; i < num_closid; i++)
-               d->mbps_val[i] = MBA_MAX_MBPS;
-
-       return 0;
-}
-
-static void mba_sc_domain_destroy(struct rdt_resource *r,
-                                 struct rdt_ctrl_domain *d)
-{
-       kfree(d->mbps_val);
-       d->mbps_val = NULL;
-}
-
-/*
- * MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale,
- * and the MBM monitor scope is the same as MBA
- * control scope.
- */
-static bool supports_mba_mbps(void)
-{
-       struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-
-       return (resctrl_is_mbm_enabled() &&
-               r->alloc_capable && is_mba_linear() &&
-               r->ctrl_scope == rmbm->mon_scope);
-}
-
-/*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
- */
-static int set_mba_sc(bool mba_sc)
-{
-       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
-       u32 num_closid = resctrl_arch_get_num_closid(r);
-       struct rdt_ctrl_domain *d;
-       unsigned long fflags;
-       int i;
-
-       if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
-               return -EINVAL;
-
-       r->membw.mba_sc = mba_sc;
-
-       rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
-
-       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-               for (i = 0; i < num_closid; i++)
-                       d->mbps_val[i] = MBA_MAX_MBPS;
-       }
-
-       fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
-       resctrl_file_fflags_init("mba_MBps_event", fflags);
-
-       return 0;
-}
-
-static int cdp_enable(int level)
-{
-       struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
-       int ret;
-
-       if (!r_l->alloc_capable)
-               return -EINVAL;
-
-       ret = set_cache_qos_cfg(level, true);
-       if (!ret)
-               rdt_resources_all[level].cdp_enabled = true;
-
-       return ret;
-}
-
-static void cdp_disable(int level)
-{
-       struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
-
-       if (r_hw->cdp_enabled) {
-               set_cache_qos_cfg(level, false);
-               r_hw->cdp_enabled = false;
-       }
-}
-
-int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
-{
-       struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
-
-       if (!hw_res->r_resctrl.cdp_capable)
-               return -EINVAL;
-
-       if (enable)
-               return cdp_enable(l);
-
-       cdp_disable(l);
-
-       return 0;
-}
-
-bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
-{
-       return rdt_resources_all[l].cdp_enabled;
-}
-
-/*
- * We don't allow rdtgroup directories to be created anywhere
- * except the root directory. Thus when looking for the rdtgroup
- * structure for a kernfs node we are either looking at a directory,
- * in which case the rdtgroup structure is pointed at by the "priv"
- * field, otherwise we have a file, and need only look to the parent
- * to find the rdtgroup.
- */
-static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
-{
-       if (kernfs_type(kn) == KERNFS_DIR) {
-               /*
-                * All the resource directories use "kn->priv"
-                * to point to the "struct rdtgroup" for the
-                * resource. "info" and its subdirectories don't
-                * have rdtgroup structures, so return NULL here.
-                */
-               if (kn == kn_info ||
-                   rcu_access_pointer(kn->__parent) == kn_info)
-                       return NULL;
-               else
-                       return kn->priv;
-       } else {
-               return rdt_kn_parent_priv(kn);
-       }
-}
-
-static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
-       atomic_inc(&rdtgrp->waitcount);
-       kernfs_break_active_protection(kn);
-}
-
-static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
-       if (atomic_dec_and_test(&rdtgrp->waitcount) &&
-           (rdtgrp->flags & RDT_DELETED)) {
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
-                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
-                       rdtgroup_pseudo_lock_remove(rdtgrp);
-               kernfs_unbreak_active_protection(kn);
-               rdtgroup_remove(rdtgrp);
-       } else {
-               kernfs_unbreak_active_protection(kn);
-       }
-}
-
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
-{
-       struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
-       if (!rdtgrp)
-               return NULL;
-
-       rdtgroup_kn_get(rdtgrp, kn);
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       /* Was this group deleted while we waited? */
-       if (rdtgrp->flags & RDT_DELETED)
-               return NULL;
-
-       return rdtgrp;
-}
-
-void rdtgroup_kn_unlock(struct kernfs_node *kn)
-{
-       struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
-       if (!rdtgrp)
-               return;
-
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-
-       rdtgroup_kn_put(rdtgrp, kn);
-}
-
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
-                            struct rdtgroup *prgrp,
-                            struct kernfs_node **mon_data_kn);
-
-static void rdt_disable_ctx(void)
-{
-       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
-       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-       set_mba_sc(false);
-
-       resctrl_debug = false;
-}
-
-static int rdt_enable_ctx(struct rdt_fs_context *ctx)
-{
-       int ret = 0;
-
-       if (ctx->enable_cdpl2) {
-               ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
-               if (ret)
-                       goto out_done;
-       }
-
-       if (ctx->enable_cdpl3) {
-               ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
-               if (ret)
-                       goto out_cdpl2;
-       }
-
-       if (ctx->enable_mba_mbps) {
-               ret = set_mba_sc(true);
-               if (ret)
-                       goto out_cdpl3;
-       }
-
-       if (ctx->enable_debug)
-               resctrl_debug = true;
-
-       return 0;
-
-out_cdpl3:
-       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
-out_cdpl2:
-       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-out_done:
-       return ret;
-}
-
-static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
-{
-       struct resctrl_schema *s;
-       const char *suffix = "";
-       int ret, cl;
-
-       s = kzalloc(sizeof(*s), GFP_KERNEL);
-       if (!s)
-               return -ENOMEM;
-
-       s->res = r;
-       s->num_closid = resctrl_arch_get_num_closid(r);
-       if (resctrl_arch_get_cdp_enabled(r->rid))
-               s->num_closid /= 2;
-
-       s->conf_type = type;
-       switch (type) {
-       case CDP_CODE:
-               suffix = "CODE";
-               break;
-       case CDP_DATA:
-               suffix = "DATA";
-               break;
-       case CDP_NONE:
-               suffix = "";
-               break;
-       }
-
-       ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
-       if (ret >= sizeof(s->name)) {
-               kfree(s);
-               return -EINVAL;
-       }
-
-       cl = strlen(s->name);
-
-       /*
-        * If CDP is supported by this resource, but not enabled,
-        * include the suffix. This ensures the tabular format of the
-        * schemata file does not change between mounts of the filesystem.
-        */
-       if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
-               cl += 4;
-
-       if (cl > max_name_width)
-               max_name_width = cl;
-
-       switch (r->schema_fmt) {
-       case RESCTRL_SCHEMA_BITMAP:
-               s->fmt_str = "%d=%x";
-               break;
-       case RESCTRL_SCHEMA_RANGE:
-               s->fmt_str = "%d=%u";
-               break;
-       }
-
-       if (WARN_ON_ONCE(!s->fmt_str)) {
-               kfree(s);
-               return -EINVAL;
-       }
-
-       INIT_LIST_HEAD(&s->list);
-       list_add(&s->list, &resctrl_schema_all);
-
-       return 0;
-}
-
-static int schemata_list_create(void)
-{
-       struct rdt_resource *r;
-       int ret = 0;
-
-       for_each_alloc_capable_rdt_resource(r) {
-               if (resctrl_arch_get_cdp_enabled(r->rid)) {
-                       ret = schemata_list_add(r, CDP_CODE);
-                       if (ret)
-                               break;
-
-                       ret = schemata_list_add(r, CDP_DATA);
-               } else {
-                       ret = schemata_list_add(r, CDP_NONE);
-               }
-
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
-static void schemata_list_destroy(void)
-{
-       struct resctrl_schema *s, *tmp;
-
-       list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
-               list_del(&s->list);
-               kfree(s);
-       }
-}
-
-static int rdt_get_tree(struct fs_context *fc)
-{
-       struct rdt_fs_context *ctx = rdt_fc2context(fc);
-       unsigned long flags = RFTYPE_CTRL_BASE;
-       struct rdt_mon_domain *dom;
-       struct rdt_resource *r;
-       int ret;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-       /*
-        * resctrl file system can only be mounted once.
-        */
-       if (resctrl_mounted) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       ret = rdtgroup_setup_root(ctx);
-       if (ret)
-               goto out;
-
-       ret = rdt_enable_ctx(ctx);
-       if (ret)
-               goto out_root;
-
-       ret = schemata_list_create();
-       if (ret) {
-               schemata_list_destroy();
-               goto out_ctx;
-       }
-
-       ret = closid_init();
-       if (ret)
-               goto out_schemata_free;
-
-       if (resctrl_arch_mon_capable())
-               flags |= RFTYPE_MON;
-
-       ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
-       if (ret)
-               goto out_closid_exit;
-
-       kernfs_activate(rdtgroup_default.kn);
-
-       ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
-       if (ret < 0)
-               goto out_closid_exit;
-
-       if (resctrl_arch_mon_capable()) {
-               ret = mongroup_create_dir(rdtgroup_default.kn,
-                                         &rdtgroup_default, "mon_groups",
-                                         &kn_mongrp);
-               if (ret < 0)
-                       goto out_info;
-
-               ret = mkdir_mondata_all(rdtgroup_default.kn,
-                                       &rdtgroup_default, &kn_mondata);
-               if (ret < 0)
-                       goto out_mongrp;
-               rdtgroup_default.mon.mon_data_kn = kn_mondata;
-       }
-
-       ret = rdt_pseudo_lock_init();
-       if (ret)
-               goto out_mondata;
-
-       ret = kernfs_get_tree(fc);
-       if (ret < 0)
-               goto out_psl;
-
-       if (resctrl_arch_alloc_capable())
-               resctrl_arch_enable_alloc();
-       if (resctrl_arch_mon_capable())
-               resctrl_arch_enable_mon();
-
-       if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
-               resctrl_mounted = true;
-
-       if (resctrl_is_mbm_enabled()) {
-               r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-               list_for_each_entry(dom, &r->mon_domains, hdr.list)
-                       mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
-                                                  RESCTRL_PICK_ANY_CPU);
-       }
-
-       goto out;
-
-out_psl:
-       rdt_pseudo_lock_release();
-out_mondata:
-       if (resctrl_arch_mon_capable())
-               kernfs_remove(kn_mondata);
-out_mongrp:
-       if (resctrl_arch_mon_capable())
-               kernfs_remove(kn_mongrp);
-out_info:
-       kernfs_remove(kn_info);
-out_closid_exit:
-       closid_exit();
-out_schemata_free:
-       schemata_list_destroy();
-out_ctx:
-       rdt_disable_ctx();
-out_root:
-       rdtgroup_destroy_root();
-out:
-       rdt_last_cmd_clear();
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-       return ret;
-}
-
-enum rdt_param {
-       Opt_cdp,
-       Opt_cdpl2,
-       Opt_mba_mbps,
-       Opt_debug,
-       nr__rdt_params
-};
-
-static const struct fs_parameter_spec rdt_fs_parameters[] = {
-       fsparam_flag("cdp",             Opt_cdp),
-       fsparam_flag("cdpl2",           Opt_cdpl2),
-       fsparam_flag("mba_MBps",        Opt_mba_mbps),
-       fsparam_flag("debug",           Opt_debug),
-       {}
-};
-
-static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
-       struct rdt_fs_context *ctx = rdt_fc2context(fc);
-       struct fs_parse_result result;
-       const char *msg;
-       int opt;
-
-       opt = fs_parse(fc, rdt_fs_parameters, param, &result);
-       if (opt < 0)
-               return opt;
-
-       switch (opt) {
-       case Opt_cdp:
-               ctx->enable_cdpl3 = true;
-               return 0;
-       case Opt_cdpl2:
-               ctx->enable_cdpl2 = true;
-               return 0;
-       case Opt_mba_mbps:
-               msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
-               if (!supports_mba_mbps())
-                       return invalfc(fc, msg);
-               ctx->enable_mba_mbps = true;
-               return 0;
-       case Opt_debug:
-               ctx->enable_debug = true;
-               return 0;
-       }
-
-       return -EINVAL;
-}
-
-static void rdt_fs_context_free(struct fs_context *fc)
-{
-       struct rdt_fs_context *ctx = rdt_fc2context(fc);
-
-       kernfs_free_fs_context(fc);
-       kfree(ctx);
-}
-
-static const struct fs_context_operations rdt_fs_context_ops = {
-       .free           = rdt_fs_context_free,
-       .parse_param    = rdt_parse_param,
-       .get_tree       = rdt_get_tree,
-};
-
-static int rdt_init_fs_context(struct fs_context *fc)
-{
-       struct rdt_fs_context *ctx;
-
-       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-       if (!ctx)
-               return -ENOMEM;
-
-       ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
-       fc->fs_private = &ctx->kfc;
-       fc->ops = &rdt_fs_context_ops;
-       put_user_ns(fc->user_ns);
-       fc->user_ns = get_user_ns(&init_user_ns);
-       fc->global = true;
-       return 0;
-}
-
-void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
-{
-       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-       struct rdt_hw_ctrl_domain *hw_dom;
-       struct msr_param msr_param;
-       struct rdt_ctrl_domain *d;
-       int i;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       msr_param.res = r;
-       msr_param.low = 0;
-       msr_param.high = hw_res->num_closid;
-
-       /*
-        * Disable resource control for this resource by setting all
-        * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
-        * from each domain to update the MSRs below.
-        */
-       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-               hw_dom = resctrl_to_arch_ctrl_dom(d);
-
-               for (i = 0; i < hw_res->num_closid; i++)
-                       hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
-               msr_param.dom = d;
-               smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
-       }
-
-       return;
-}
-
-/*
- * Move tasks from one to the other group. If @from is NULL, then all tasks
- * in the systems are moved unconditionally (used for teardown).
- *
- * If @mask is not NULL the cpus on which moved tasks are running are set
- * in that mask so the update smp function call is restricted to affected
- * cpus.
- */
-static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
-                                struct cpumask *mask)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-       for_each_process_thread(p, t) {
-               if (!from || is_closid_match(t, from) ||
-                   is_rmid_match(t, from)) {
-                       resctrl_arch_set_closid_rmid(t, to->closid,
-                                                    to->mon.rmid);
-
-                       /*
-                        * Order the closid/rmid stores above before the loads
-                        * in task_curr(). This pairs with the full barrier
-                        * between the rq->curr update and
-                        * resctrl_arch_sched_in() during context switch.
-                        */
-                       smp_mb();
-
-                       /*
-                        * If the task is on a CPU, set the CPU in the mask.
-                        * The detection is inaccurate as tasks might move or
-                        * schedule before the smp function call takes place.
-                        * In such a case the function call is pointless, but
-                        * there is no other side effect.
-                        */
-                       if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
-                               cpumask_set_cpu(task_cpu(t), mask);
-               }
-       }
-       read_unlock(&tasklist_lock);
-}
-
-static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
-{
-       struct rdtgroup *sentry, *stmp;
-       struct list_head *head;
-
-       head = &rdtgrp->mon.crdtgrp_list;
-       list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
-               free_rmid(sentry->closid, sentry->mon.rmid);
-               list_del(&sentry->mon.crdtgrp_list);
-
-               if (atomic_read(&sentry->waitcount) != 0)
-                       sentry->flags = RDT_DELETED;
-               else
-                       rdtgroup_remove(sentry);
-       }
-}
-
-/*
- * Forcibly remove all of subdirectories under root.
- */
-static void rmdir_all_sub(void)
-{
-       struct rdtgroup *rdtgrp, *tmp;
-
-       /* Move all tasks to the default resource group */
-       rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
-
-       list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
-               /* Free any child rmids */
-               free_all_child_rdtgrp(rdtgrp);
-
-               /* Remove each rdtgroup other than root */
-               if (rdtgrp == &rdtgroup_default)
-                       continue;
-
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
-                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
-                       rdtgroup_pseudo_lock_remove(rdtgrp);
-
-               /*
-                * Give any CPUs back to the default group. We cannot copy
-                * cpu_online_mask because a CPU might have executed the
-                * offline callback already, but is still marked online.
-                */
-               cpumask_or(&rdtgroup_default.cpu_mask,
-                          &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
-               free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
-               kernfs_remove(rdtgrp->kn);
-               list_del(&rdtgrp->rdtgroup_list);
-
-               if (atomic_read(&rdtgrp->waitcount) != 0)
-                       rdtgrp->flags = RDT_DELETED;
-               else
-                       rdtgroup_remove(rdtgrp);
-       }
-       /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
-       update_closid_rmid(cpu_online_mask, &rdtgroup_default);
-
-       kernfs_remove(kn_info);
-       kernfs_remove(kn_mongrp);
-       kernfs_remove(kn_mondata);
-}
-
-/**
- * mon_get_kn_priv() - Get the mon_data priv data for this event.
- *
- * The same values are used across the mon_data directories of all control and
- * monitor groups for the same event in the same domain. Keep a list of
- * allocated structures and re-use an existing one with the same values for
- * @rid, @domid, etc.
- *
- * @rid:    The resource id for the event file being created.
- * @domid:  The domain id for the event file being created.
- * @mevt:   The type of event file being created.
- * @do_sum: Whether SNC summing monitors are being created.
- */
-static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
-                                       struct mon_evt *mevt,
-                                       bool do_sum)
-{
-       struct mon_data *priv;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       list_for_each_entry(priv, &mon_data_kn_priv_list, list) {
-               if (priv->rid == rid && priv->domid == domid &&
-                   priv->sum == do_sum && priv->evtid == mevt->evtid)
-                       return priv;
-       }
-
-       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-       if (!priv)
-               return NULL;
-
-       priv->rid = rid;
-       priv->domid = domid;
-       priv->sum = do_sum;
-       priv->evtid = mevt->evtid;
-       list_add_tail(&priv->list, &mon_data_kn_priv_list);
-
-       return priv;
-}
-
-/**
- * mon_put_kn_priv() - Free all allocated mon_data structures.
- *
- * Called when resctrl file system is unmounted.
- */
-static void mon_put_kn_priv(void)
-{
-       struct mon_data *priv, *tmp;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) {
-               list_del(&priv->list);
-               kfree(priv);
-       }
-}
-
-static void resctrl_fs_teardown(void)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       /* Cleared by rdtgroup_destroy_root() */
-       if (!rdtgroup_default.kn)
-               return;
-
-       rmdir_all_sub();
-       mon_put_kn_priv();
-       rdt_pseudo_lock_release();
-       rdtgroup_default.mode = RDT_MODE_SHAREABLE;
-       closid_exit();
-       schemata_list_destroy();
-       rdtgroup_destroy_root();
-}
-
-static void rdt_kill_sb(struct super_block *sb)
-{
-       struct rdt_resource *r;
-
-       cpus_read_lock();
-       mutex_lock(&rdtgroup_mutex);
-
-       rdt_disable_ctx();
-
-       /* Put everything back to default values. */
-       for_each_alloc_capable_rdt_resource(r)
-               resctrl_arch_reset_all_ctrls(r);
-
-       resctrl_fs_teardown();
-       if (resctrl_arch_alloc_capable())
-               resctrl_arch_disable_alloc();
-       if (resctrl_arch_mon_capable())
-               resctrl_arch_disable_mon();
-       resctrl_mounted = false;
-       kernfs_kill_sb(sb);
-       mutex_unlock(&rdtgroup_mutex);
-       cpus_read_unlock();
-}
-
-static struct file_system_type rdt_fs_type = {
-       .name                   = "resctrl",
-       .init_fs_context        = rdt_init_fs_context,
-       .parameters             = rdt_fs_parameters,
-       .kill_sb                = rdt_kill_sb,
-};
-
-static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
-                      void *priv)
-{
-       struct kernfs_node *kn;
-       int ret = 0;
-
-       kn = __kernfs_create_file(parent_kn, name, 0444,
-                                 GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
-                                 &kf_mondata_ops, priv, NULL, NULL);
-       if (IS_ERR(kn))
-               return PTR_ERR(kn);
-
-       ret = rdtgroup_kn_set_ugid(kn);
-       if (ret) {
-               kernfs_remove(kn);
-               return ret;
-       }
-
-       return ret;
-}
-
-static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
-{
-       struct kernfs_node *kn;
-
-       kn = kernfs_find_and_get(pkn, name);
-       if (!kn)
-               return;
-       kernfs_put(kn);
-
-       if (kn->dir.subdirs <= 1)
-               kernfs_remove(kn);
-       else
-               kernfs_remove_by_name(kn, subname);
-}
-
-/*
- * Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups for the given domain.
- * Remove files and directories containing "sum" of domain data
- * when last domain being summed is removed.
- */
-static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
-                                          struct rdt_mon_domain *d)
-{
-       struct rdtgroup *prgrp, *crgrp;
-       char subname[32];
-       bool snc_mode;
-       char name[32];
-
-       snc_mode = r->mon_scope == RESCTRL_L3_NODE;
-       sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
-       if (snc_mode)
-               sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
-
-       list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
-               mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
-
-               list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
-                       mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
-       }
-}
-
-static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
-                            struct rdt_resource *r, struct rdtgroup *prgrp,
-                            bool do_sum)
-{
-       struct rmid_read rr = {0};
-       struct mon_data *priv;
-       struct mon_evt *mevt;
-       int ret, domid;
-
-       if (WARN_ON(list_empty(&r->evt_list)))
-               return -EPERM;
-
-       list_for_each_entry(mevt, &r->evt_list, list) {
-               domid = do_sum ? d->ci->id : d->hdr.id;
-               priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
-               if (WARN_ON_ONCE(!priv))
-                       return -EINVAL;
-
-               ret = mon_addfile(kn, mevt->name, priv);
-               if (ret)
-                       return ret;
-
-               if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
-                       mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
-       }
-
-       return 0;
-}
-
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
-                               struct rdt_mon_domain *d,
-                               struct rdt_resource *r, struct rdtgroup *prgrp)
-{
-       struct kernfs_node *kn, *ckn;
-       char name[32];
-       bool snc_mode;
-       int ret = 0;
-
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       snc_mode = r->mon_scope == RESCTRL_L3_NODE;
-       sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
-       kn = kernfs_find_and_get(parent_kn, name);
-       if (kn) {
-               /*
-                * rdtgroup_mutex will prevent this directory from being
-                * removed. No need to keep this hold.
-                */
-               kernfs_put(kn);
-       } else {
-               kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
-               if (IS_ERR(kn))
-                       return PTR_ERR(kn);
-
-               ret = rdtgroup_kn_set_ugid(kn);
-               if (ret)
-                       goto out_destroy;
-               ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
-               if (ret)
-                       goto out_destroy;
-       }
-
-       if (snc_mode) {
-               sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
-               ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
-               if (IS_ERR(ckn)) {
-                       ret = -EINVAL;
-                       goto out_destroy;
-               }
-
-               ret = rdtgroup_kn_set_ugid(ckn);
-               if (ret)
-                       goto out_destroy;
-
-               ret = mon_add_all_files(ckn, d, r, prgrp, false);
-               if (ret)
-                       goto out_destroy;
-       }
-
-       kernfs_activate(kn);
-       return 0;
-
-out_destroy:
-       kernfs_remove(kn);
-       return ret;
-}
-
-/*
- * Add all subdirectories of mon_data for "ctrl_mon" groups
- * and "monitor" groups with given domain id.
- */
-static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
-                                          struct rdt_mon_domain *d)
-{
-       struct kernfs_node *parent_kn;
-       struct rdtgroup *prgrp, *crgrp;
-       struct list_head *head;
-
-       list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
-               parent_kn = prgrp->mon.mon_data_kn;
-               mkdir_mondata_subdir(parent_kn, d, r, prgrp);
-
-               head = &prgrp->mon.crdtgrp_list;
-               list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
-                       parent_kn = crgrp->mon.mon_data_kn;
-                       mkdir_mondata_subdir(parent_kn, d, r, crgrp);
-               }
-       }
-}
-
-static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
-                                      struct rdt_resource *r,
-                                      struct rdtgroup *prgrp)
-{
-       struct rdt_mon_domain *dom;
-       int ret;
-
-       /* Walking r->domains, ensure it can't race with cpuhp */
-       lockdep_assert_cpus_held();
-
-       list_for_each_entry(dom, &r->mon_domains, hdr.list) {
-               ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/*
- * This creates a directory mon_data which contains the monitored data.
- *
- * mon_data has one directory for each domain which are named
- * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
- * with L3 domain looks as below:
- * ./mon_data:
- * mon_L3_00
- * mon_L3_01
- * mon_L3_02
- * ...
- *
- * Each domain directory has one file per event:
- * ./mon_L3_00/:
- * llc_occupancy
- *
- */
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
-                            struct rdtgroup *prgrp,
-                            struct kernfs_node **dest_kn)
-{
-       struct rdt_resource *r;
-       struct kernfs_node *kn;
-       int ret;
-
-       /*
-        * Create the mon_data directory first.
-        */
-       ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
-       if (ret)
-               return ret;
-
-       if (dest_kn)
-               *dest_kn = kn;
-
-       /*
-        * Create the subdirectories for each domain. Note that all events
-        * in a domain like L3 are grouped into a resource whose domain is L3
-        */
-       for_each_mon_capable_rdt_resource(r) {
-               ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
-               if (ret)
-                       goto out_destroy;
-       }
-
-       return 0;
-
-out_destroy:
-       kernfs_remove(kn);
-       return ret;
-}
-
-/**
- * cbm_ensure_valid - Enforce validity on provided CBM
- * @_val:      Candidate CBM
- * @r:         RDT resource to which the CBM belongs
- *
- * The provided CBM represents all cache portions available for use. This
- * may be represented by a bitmap that does not consist of contiguous ones
- * and thus be an invalid CBM.
- * Here the provided CBM is forced to be a valid CBM by only considering
- * the first set of contiguous bits as valid and clearing all bits.
- * The intention here is to provide a valid default CBM with which a new
- * resource group is initialized. The user can follow this with a
- * modification to the CBM if the default does not satisfy the
- * requirements.
- */
-static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
-{
-       unsigned int cbm_len = r->cache.cbm_len;
-       unsigned long first_bit, zero_bit;
-       unsigned long val = _val;
-
-       if (!val)
-               return 0;
-
-       first_bit = find_first_bit(&val, cbm_len);
-       zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
-       /* Clear any remaining bits to ensure contiguous region */
-       bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
-       return (u32)val;
-}
-
-/*
- * Initialize cache resources per RDT domain
- *
- * Set the RDT domain up to start off with all usable allocations. That is,
- * all shareable and unused bits. All-zero CBM is invalid.
- */
-static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
-                                u32 closid)
-{
-       enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
-       enum resctrl_conf_type t = s->conf_type;
-       struct resctrl_staged_config *cfg;
-       struct rdt_resource *r = s->res;
-       u32 used_b = 0, unused_b = 0;
-       unsigned long tmp_cbm;
-       enum rdtgrp_mode mode;
-       u32 peer_ctl, ctrl_val;
-       int i;
-
-       cfg = &d->staged_config[t];
-       cfg->have_new_ctrl = false;
-       cfg->new_ctrl = r->cache.shareable_bits;
-       used_b = r->cache.shareable_bits;
-       for (i = 0; i < closids_supported(); i++) {
-               if (closid_allocated(i) && i != closid) {
-                       mode = rdtgroup_mode_by_closid(i);
-                       if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
-                               /*
-                                * ctrl values for locksetup aren't relevant
-                                * until the schemata is written, and the mode
-                                * becomes RDT_MODE_PSEUDO_LOCKED.
-                                */
-                               continue;
-                       /*
-                        * If CDP is active include peer domain's
-                        * usage to ensure there is no overlap
-                        * with an exclusive group.
-                        */
-                       if (resctrl_arch_get_cdp_enabled(r->rid))
-                               peer_ctl = resctrl_arch_get_config(r, d, i,
-                                                                  peer_type);
-                       else
-                               peer_ctl = 0;
-                       ctrl_val = resctrl_arch_get_config(r, d, i,
-                                                          s->conf_type);
-                       used_b |= ctrl_val | peer_ctl;
-                       if (mode == RDT_MODE_SHAREABLE)
-                               cfg->new_ctrl |= ctrl_val | peer_ctl;
-               }
-       }
-       if (d->plr && d->plr->cbm > 0)
-               used_b |= d->plr->cbm;
-       unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
-       unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
-       cfg->new_ctrl |= unused_b;
-       /*
-        * Force the initial CBM to be valid, user can
-        * modify the CBM based on system availability.
-        */
-       cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
-       /*
-        * Assign the u32 CBM to an unsigned long to ensure that
-        * bitmap_weight() does not access out-of-bound memory.
-        */
-       tmp_cbm = cfg->new_ctrl;
-       if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
-               rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
-               return -ENOSPC;
-       }
-       cfg->have_new_ctrl = true;
-
-       return 0;
-}
-
-/*
- * Initialize cache resources with default values.
- *
- * A new RDT group is being created on an allocation capable (CAT)
- * supporting system. Set this group up to start off with all usable
- * allocations.
- *
- * If there are no more shareable bits available on any domain then
- * the entire allocation will fail.
- */
-static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
-{
-       struct rdt_ctrl_domain *d;
-       int ret;
-
-       list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
-               ret = __init_one_rdt_domain(d, s, closid);
-               if (ret < 0)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
-{
-       struct resctrl_staged_config *cfg;
-       struct rdt_ctrl_domain *d;
-
-       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
-               if (is_mba_sc(r)) {
-                       d->mbps_val[closid] = MBA_MAX_MBPS;
-                       continue;
-               }
-
-               cfg = &d->staged_config[CDP_NONE];
-               cfg->new_ctrl = resctrl_get_default_ctrl(r);
-               cfg->have_new_ctrl = true;
-       }
-}
-
-/* Initialize the RDT group's allocations. */
-static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
-{
-       struct resctrl_schema *s;
-       struct rdt_resource *r;
-       int ret = 0;
-
-       rdt_staged_configs_clear();
-
-       list_for_each_entry(s, &resctrl_schema_all, list) {
-               r = s->res;
-               if (r->rid == RDT_RESOURCE_MBA ||
-                   r->rid == RDT_RESOURCE_SMBA) {
-                       rdtgroup_init_mba(r, rdtgrp->closid);
-                       if (is_mba_sc(r))
-                               continue;
-               } else {
-                       ret = rdtgroup_init_cat(s, rdtgrp->closid);
-                       if (ret < 0)
-                               goto out;
-               }
-
-               ret = resctrl_arch_update_domains(r, rdtgrp->closid);
-               if (ret < 0) {
-                       rdt_last_cmd_puts("Failed to initialize allocations\n");
-                       goto out;
-               }
-       }
-
-       rdtgrp->mode = RDT_MODE_SHAREABLE;
-
-out:
-       rdt_staged_configs_clear();
-       return ret;
-}
-
-static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
-{
-       int ret;
-
-       if (!resctrl_arch_mon_capable())
-               return 0;
-
-       ret = alloc_rmid(rdtgrp->closid);
-       if (ret < 0) {
-               rdt_last_cmd_puts("Out of RMIDs\n");
-               return ret;
-       }
-       rdtgrp->mon.rmid = ret;
-
-       ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
-       if (ret) {
-               rdt_last_cmd_puts("kernfs subdir error\n");
-               free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-               return ret;
-       }
-
-       return 0;
-}
-
-static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
-{
-       if (resctrl_arch_mon_capable())
-               free_rmid(rgrp->closid, rgrp->mon.rmid);
-}
-
-/*
- * We allow creating mon groups only with in a directory called "mon_groups"
- * which is present in every ctrl_mon group. Check if this is a valid
- * "mon_groups" directory.
- *
- * 1. The directory should be named "mon_groups".
- * 2. The mon group itself should "not" be named "mon_groups".
- *   This makes sure "mon_groups" directory always has a ctrl_mon group
- *   as parent.
- */
-static bool is_mon_groups(struct kernfs_node *kn, const char *name)
-{
-       return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
-               strcmp(name, "mon_groups"));
-}
-
-static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
-                            const char *name, umode_t mode,
-                            enum rdt_group_type rtype, struct rdtgroup **r)
-{
-       struct rdtgroup *prdtgrp, *rdtgrp;
-       unsigned long files = 0;
-       struct kernfs_node *kn;
-       int ret;
-
-       prdtgrp = rdtgroup_kn_lock_live(parent_kn);
-       if (!prdtgrp) {
-               ret = -ENODEV;
-               goto out_unlock;
-       }
-
-       /*
-        * Check that the parent directory for a monitor group is a "mon_groups"
-        * directory.
-        */
-       if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
-               ret = -EPERM;
-               goto out_unlock;
-       }
-
-       if (rtype == RDTMON_GROUP &&
-           (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
-            prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
-               ret = -EINVAL;
-               rdt_last_cmd_puts("Pseudo-locking in progress\n");
-               goto out_unlock;
-       }
-
-       /* allocate the rdtgroup. */
-       rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
-       if (!rdtgrp) {
-               ret = -ENOSPC;
-               rdt_last_cmd_puts("Kernel out of memory\n");
-               goto out_unlock;
-       }
-       *r = rdtgrp;
-       rdtgrp->mon.parent = prdtgrp;
-       rdtgrp->type = rtype;
-       INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
-
-       /* kernfs creates the directory for rdtgrp */
-       kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
-       if (IS_ERR(kn)) {
-               ret = PTR_ERR(kn);
-               rdt_last_cmd_puts("kernfs create error\n");
-               goto out_free_rgrp;
-       }
-       rdtgrp->kn = kn;
-
-       /*
-        * kernfs_remove() will drop the reference count on "kn" which
-        * will free it. But we still need it to stick around for the
-        * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
-        * which will be dropped by kernfs_put() in rdtgroup_remove().
-        */
-       kernfs_get(kn);
-
-       ret = rdtgroup_kn_set_ugid(kn);
-       if (ret) {
-               rdt_last_cmd_puts("kernfs perm error\n");
-               goto out_destroy;
-       }
-
-       if (rtype == RDTCTRL_GROUP) {
-               files = RFTYPE_BASE | RFTYPE_CTRL;
-               if (resctrl_arch_mon_capable())
-                       files |= RFTYPE_MON;
-       } else {
-               files = RFTYPE_BASE | RFTYPE_MON;
-       }
-
-       ret = rdtgroup_add_files(kn, files);
-       if (ret) {
-               rdt_last_cmd_puts("kernfs fill error\n");
-               goto out_destroy;
-       }
-
-       /*
-        * The caller unlocks the parent_kn upon success.
-        */
-       return 0;
-
-out_destroy:
-       kernfs_put(rdtgrp->kn);
-       kernfs_remove(rdtgrp->kn);
-out_free_rgrp:
-       kfree(rdtgrp);
-out_unlock:
-       rdtgroup_kn_unlock(parent_kn);
-       return ret;
-}
-
-static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
-{
-       kernfs_remove(rgrp->kn);
-       rdtgroup_remove(rgrp);
-}
-
-/*
- * Create a monitor group under "mon_groups" directory of a control
- * and monitor group(ctrl_mon). This is a resource group
- * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
- */
-static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
-                             const char *name, umode_t mode)
-{
-       struct rdtgroup *rdtgrp, *prgrp;
-       int ret;
-
-       ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
-       if (ret)
-               return ret;
-
-       prgrp = rdtgrp->mon.parent;
-       rdtgrp->closid = prgrp->closid;
-
-       ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
-       if (ret) {
-               mkdir_rdt_prepare_clean(rdtgrp);
-               goto out_unlock;
-       }
-
-       kernfs_activate(rdtgrp->kn);
-
-       /*
-        * Add the rdtgrp to the list of rdtgrps the parent
-        * ctrl_mon group has to track.
-        */
-       list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
-
-out_unlock:
-       rdtgroup_kn_unlock(parent_kn);
-       return ret;
-}
-
-/*
- * These are rdtgroups created under the root directory. Can be used
- * to allocate and monitor resources.
- */
-static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
-                                  const char *name, umode_t mode)
-{
-       struct rdtgroup *rdtgrp;
-       struct kernfs_node *kn;
-       u32 closid;
-       int ret;
-
-       ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
-       if (ret)
-               return ret;
-
-       kn = rdtgrp->kn;
-       ret = closid_alloc();
-       if (ret < 0) {
-               rdt_last_cmd_puts("Out of CLOSIDs\n");
-               goto out_common_fail;
-       }
-       closid = ret;
-       ret = 0;
-
-       rdtgrp->closid = closid;
-
-       ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
-       if (ret)
-               goto out_closid_free;
-
-       kernfs_activate(rdtgrp->kn);
-
-       ret = rdtgroup_init_alloc(rdtgrp);
-       if (ret < 0)
-               goto out_rmid_free;
-
-       list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
-
-       if (resctrl_arch_mon_capable()) {
-               /*
-                * Create an empty mon_groups directory to hold the subset
-                * of tasks and cpus to monitor.
-                */
-               ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
-               if (ret) {
-                       rdt_last_cmd_puts("kernfs subdir error\n");
-                       goto out_del_list;
-               }
-               if (is_mba_sc(NULL))
-                       rdtgrp->mba_mbps_event = mba_mbps_default_event;
-       }
-
-       goto out_unlock;
-
-out_del_list:
-       list_del(&rdtgrp->rdtgroup_list);
-out_rmid_free:
-       mkdir_rdt_prepare_rmid_free(rdtgrp);
-out_closid_free:
-       closid_free(closid);
-out_common_fail:
-       mkdir_rdt_prepare_clean(rdtgrp);
-out_unlock:
-       rdtgroup_kn_unlock(parent_kn);
-       return ret;
-}
-
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-                         umode_t mode)
-{
-       /* Do not accept '\n' to avoid unparsable situation. */
-       if (strchr(name, '\n'))
-               return -EINVAL;
-
-       /*
-        * If the parent directory is the root directory and RDT
-        * allocation is supported, add a control and monitoring
-        * subdirectory
-        */
-       if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
-               return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
-
-       /* Else, attempt to add a monitoring subdirectory. */
-       if (resctrl_arch_mon_capable())
-               return rdtgroup_mkdir_mon(parent_kn, name, mode);
-
-       return -EPERM;
-}
-
-static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
-       struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
-       u32 closid, rmid;
-       int cpu;
-
-       /* Give any tasks back to the parent group */
-       rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
-
-       /*
-        * Update per cpu closid/rmid of the moved CPUs first.
-        * Note: the closid will not change, but the arch code still needs it.
-        */
-       closid = prdtgrp->closid;
-       rmid = prdtgrp->mon.rmid;
-       for_each_cpu(cpu, &rdtgrp->cpu_mask)
-               resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
-
-       /*
-        * Update the MSR on moved CPUs and CPUs which have moved
-        * task running on them.
-        */
-       cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
-       update_closid_rmid(tmpmask, NULL);
-
-       rdtgrp->flags = RDT_DELETED;
-       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
-       /*
-        * Remove the rdtgrp from the parent ctrl_mon group's list
-        */
-       WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
-       list_del(&rdtgrp->mon.crdtgrp_list);
-
-       kernfs_remove(rdtgrp->kn);
-
-       return 0;
-}
-
-static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
-{
-       rdtgrp->flags = RDT_DELETED;
-       list_del(&rdtgrp->rdtgroup_list);
-
-       kernfs_remove(rdtgrp->kn);
-       return 0;
-}
-
-static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
-       u32 closid, rmid;
-       int cpu;
-
-       /* Give any tasks back to the default group */
-       rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
-
-       /* Give any CPUs back to the default group */
-       cpumask_or(&rdtgroup_default.cpu_mask,
-                  &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
-       /* Update per cpu closid and rmid of the moved CPUs first */
-       closid = rdtgroup_default.closid;
-       rmid = rdtgroup_default.mon.rmid;
-       for_each_cpu(cpu, &rdtgrp->cpu_mask)
-               resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
-
-       /*
-        * Update the MSR on moved CPUs and CPUs which have moved
-        * task running on them.
-        */
-       cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
-       update_closid_rmid(tmpmask, NULL);
-
-       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-       closid_free(rdtgrp->closid);
-
-       rdtgroup_ctrl_remove(rdtgrp);
-
-       /*
-        * Free all the child monitor group rmids.
-        */
-       free_all_child_rdtgrp(rdtgrp);
-
-       return 0;
-}
-
-static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
-{
-       /*
-        * Valid within the RCU section it was obtained or while rdtgroup_mutex
-        * is held.
-        */
-       return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
-}
-
-static int rdtgroup_rmdir(struct kernfs_node *kn)
-{
-       struct kernfs_node *parent_kn;
-       struct rdtgroup *rdtgrp;
-       cpumask_var_t tmpmask;
-       int ret = 0;
-
-       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
-               return -ENOMEM;
-
-       rdtgrp = rdtgroup_kn_lock_live(kn);
-       if (!rdtgrp) {
-               ret = -EPERM;
-               goto out;
-       }
-       parent_kn = rdt_kn_parent(kn);
-
-       /*
-        * If the rdtgroup is a ctrl_mon group and parent directory
-        * is the root directory, remove the ctrl_mon group.
-        *
-        * If the rdtgroup is a mon group and parent directory
-        * is a valid "mon_groups" directory, remove the mon group.
-        */
-       if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
-           rdtgrp != &rdtgroup_default) {
-               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
-                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
-                       ret = rdtgroup_ctrl_remove(rdtgrp);
-               } else {
-                       ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
-               }
-       } else if (rdtgrp->type == RDTMON_GROUP &&
-                is_mon_groups(parent_kn, rdt_kn_name(kn))) {
-               ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
-       } else {
-               ret = -EPERM;
-       }
-
-out:
-       rdtgroup_kn_unlock(kn);
-       free_cpumask_var(tmpmask);
-       return ret;
-}
-
-/**
- * mongrp_reparent() - replace parent CTRL_MON group of a MON group
- * @rdtgrp:            the MON group whose parent should be replaced
- * @new_prdtgrp:       replacement parent CTRL_MON group for @rdtgrp
- * @cpus:              cpumask provided by the caller for use during this call
- *
- * Replaces the parent CTRL_MON group for a MON group, resulting in all member
- * tasks' CLOSID immediately changing to that of the new parent group.
- * Monitoring data for the group is unaffected by this operation.
- */
-static void mongrp_reparent(struct rdtgroup *rdtgrp,
-                           struct rdtgroup *new_prdtgrp,
-                           cpumask_var_t cpus)
-{
-       struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
-
-       WARN_ON(rdtgrp->type != RDTMON_GROUP);
-       WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
-
-       /* Nothing to do when simply renaming a MON group. */
-       if (prdtgrp == new_prdtgrp)
-               return;
-
-       WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
-       list_move_tail(&rdtgrp->mon.crdtgrp_list,
-                      &new_prdtgrp->mon.crdtgrp_list);
-
-       rdtgrp->mon.parent = new_prdtgrp;
-       rdtgrp->closid = new_prdtgrp->closid;
-
-       /* Propagate updated closid to all tasks in this group. */
-       rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
-
-       update_closid_rmid(cpus, NULL);
-}
-
-static int rdtgroup_rename(struct kernfs_node *kn,
-                          struct kernfs_node *new_parent, const char *new_name)
-{
-       struct kernfs_node *kn_parent;
-       struct rdtgroup *new_prdtgrp;
-       struct rdtgroup *rdtgrp;
-       cpumask_var_t tmpmask;
-       int ret;
-
-       rdtgrp = kernfs_to_rdtgroup(kn);
-       new_prdtgrp = kernfs_to_rdtgroup(new_parent);
-       if (!rdtgrp || !new_prdtgrp)
-               return -ENOENT;
-
-       /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
-       rdtgroup_kn_get(rdtgrp, kn);
-       rdtgroup_kn_get(new_prdtgrp, new_parent);
-
-       mutex_lock(&rdtgroup_mutex);
-
-       rdt_last_cmd_clear();
-
-       /*
-        * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
-        * either kernfs_node is a file.
-        */
-       if (kernfs_type(kn) != KERNFS_DIR ||
-           kernfs_type(new_parent) != KERNFS_DIR) {
-               rdt_last_cmd_puts("Source and destination must be directories");
-               ret = -EPERM;
-               goto out;
-       }
-
-       if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
-               ret = -ENOENT;
-               goto out;
-       }
-
-       kn_parent = rdt_kn_parent(kn);
-       if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
-           !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
-               rdt_last_cmd_puts("Source must be a MON group\n");
-               ret = -EPERM;
-               goto out;
-       }
-
-       if (!is_mon_groups(new_parent, new_name)) {
-               rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
-               ret = -EPERM;
-               goto out;
-       }
-
-       /*
-        * If the MON group is monitoring CPUs, the CPUs must be assigned to the
-        * current parent CTRL_MON group and therefore cannot be assigned to
-        * the new parent, making the move illegal.
-        */
-       if (!cpumask_empty(&rdtgrp->cpu_mask) &&
-           rdtgrp->mon.parent != new_prdtgrp) {
-               rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
-               ret = -EPERM;
-               goto out;
-       }
-
-       /*
-        * Allocate the cpumask for use in mongrp_reparent() to avoid the
-        * possibility of failing to allocate it after kernfs_rename() has
-        * succeeded.
-        */
-       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /*
-        * Perform all input validation and allocations needed to ensure
-        * mongrp_reparent() will succeed before calling kernfs_rename(),
-        * otherwise it would be necessary to revert this call if
-        * mongrp_reparent() failed.
-        */
-       ret = kernfs_rename(kn, new_parent, new_name);
-       if (!ret)
-               mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
-
-       free_cpumask_var(tmpmask);
-
-out:
-       mutex_unlock(&rdtgroup_mutex);
-       rdtgroup_kn_put(rdtgrp, kn);
-       rdtgroup_kn_put(new_prdtgrp, new_parent);
-       return ret;
-}
-
-static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
-{
-       if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
-               seq_puts(seq, ",cdp");
-
-       if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
-               seq_puts(seq, ",cdpl2");
-
-       if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
-               seq_puts(seq, ",mba_MBps");
-
-       if (resctrl_debug)
-               seq_puts(seq, ",debug");
-
-       return 0;
-}
-
-static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
-       .mkdir          = rdtgroup_mkdir,
-       .rmdir          = rdtgroup_rmdir,
-       .rename         = rdtgroup_rename,
-       .show_options   = rdtgroup_show_options,
-};
-
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
-{
-       rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
-                                     KERNFS_ROOT_CREATE_DEACTIVATED |
-                                     KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
-                                     &rdtgroup_default);
-       if (IS_ERR(rdt_root))
-               return PTR_ERR(rdt_root);
-
-       ctx->kfc.root = rdt_root;
-       rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
-
-       return 0;
-}
-
-static void rdtgroup_destroy_root(void)
-{
-       lockdep_assert_held(&rdtgroup_mutex);
-
-       kernfs_destroy_root(rdt_root);
-       rdtgroup_default.kn = NULL;
-}
-
-static void rdtgroup_setup_default(void)
-{
-       mutex_lock(&rdtgroup_mutex);
-
-       rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
-       rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
-       rdtgroup_default.type = RDTCTRL_GROUP;
-       INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
-
-       list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
-
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-static void domain_destroy_mon_state(struct rdt_mon_domain *d)
-{
-       bitmap_free(d->rmid_busy_llc);
-       kfree(d->mbm_total);
-       kfree(d->mbm_local);
-}
-
-void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
-       mutex_lock(&rdtgroup_mutex);
-
-       if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
-               mba_sc_domain_destroy(r, d);
-
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
-       mutex_lock(&rdtgroup_mutex);
-
-       /*
-        * If resctrl is mounted, remove all the
-        * per domain monitor data directories.
-        */
-       if (resctrl_mounted && resctrl_arch_mon_capable())
-               rmdir_mondata_subdir_allrdtgrp(r, d);
-
-       if (resctrl_is_mbm_enabled())
-               cancel_delayed_work(&d->mbm_over);
-       if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
-               /*
-                * When a package is going down, forcefully
-                * decrement rmid->ebusy. There is no way to know
-                * that the L3 was flushed and hence may lead to
-                * incorrect counts in rare scenarios, but leaving
-                * the RMID as busy creates RMID leaks if the
-                * package never comes back.
-                */
-               __check_limbo(d, true);
-               cancel_delayed_work(&d->cqm_limbo);
-       }
-
-       domain_destroy_mon_state(d);
-
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-/**
- * domain_setup_mon_state() -  Initialise domain monitoring structures.
- * @r: The resource for the newly online domain.
- * @d: The newly online domain.
- *
- * Allocate monitor resources that belong to this domain.
- * Called when the first CPU of a domain comes online, regardless of whether
- * the filesystem is mounted.
- * During boot this may be called before global allocations have been made by
- * resctrl_mon_resource_init().
- *
- * Returns 0 for success, or -ENOMEM.
- */
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
-       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-       size_t tsize;
-
-       if (resctrl_arch_is_llc_occupancy_enabled()) {
-               d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
-               if (!d->rmid_busy_llc)
-                       return -ENOMEM;
-       }
-       if (resctrl_arch_is_mbm_total_enabled()) {
-               tsize = sizeof(*d->mbm_total);
-               d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
-               if (!d->mbm_total) {
-                       bitmap_free(d->rmid_busy_llc);
-                       return -ENOMEM;
-               }
-       }
-       if (resctrl_arch_is_mbm_local_enabled()) {
-               tsize = sizeof(*d->mbm_local);
-               d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
-               if (!d->mbm_local) {
-                       bitmap_free(d->rmid_busy_llc);
-                       kfree(d->mbm_total);
-                       return -ENOMEM;
-               }
-       }
-
-       return 0;
-}
-
-int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
-       int err = 0;
-
-       mutex_lock(&rdtgroup_mutex);
-
-       if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
-               /* RDT_RESOURCE_MBA is never mon_capable */
-               err = mba_sc_domain_allocate(r, d);
-       }
-
-       mutex_unlock(&rdtgroup_mutex);
-
-       return err;
-}
-
-int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
-       int err;
-
-       mutex_lock(&rdtgroup_mutex);
-
-       err = domain_setup_mon_state(r, d);
-       if (err)
-               goto out_unlock;
-
-       if (resctrl_is_mbm_enabled()) {
-               INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
-               mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
-                                          RESCTRL_PICK_ANY_CPU);
-       }
-
-       if (resctrl_arch_is_llc_occupancy_enabled())
-               INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
-
-       /*
-        * If the filesystem is not mounted then only the default resource group
-        * exists. Creation of its directories is deferred until mount time
-        * by rdt_get_tree() calling mkdir_mondata_all().
-        * If resctrl is mounted, add per domain monitor data directories.
-        */
-       if (resctrl_mounted && resctrl_arch_mon_capable())
-               mkdir_mondata_subdir_allrdtgrp(r, d);
-
-out_unlock:
-       mutex_unlock(&rdtgroup_mutex);
-
-       return err;
-}
-
-void resctrl_online_cpu(unsigned int cpu)
-{
-       mutex_lock(&rdtgroup_mutex);
-       /* The CPU is set in default rdtgroup after online. */
-       cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
-       struct rdtgroup *cr;
-
-       list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
-               if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
-                       break;
-       }
-}
-
-static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
-                                                     struct rdt_resource *r)
-{
-       struct rdt_mon_domain *d;
-
-       lockdep_assert_cpus_held();
-
-       list_for_each_entry(d, &r->mon_domains, hdr.list) {
-               /* Find the domain that contains this CPU */
-               if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
-                       return d;
-       }
-
-       return NULL;
-}
-
-void resctrl_offline_cpu(unsigned int cpu)
-{
-       struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
-       struct rdt_mon_domain *d;
-       struct rdtgroup *rdtgrp;
-
-       mutex_lock(&rdtgroup_mutex);
-       list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
-               if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
-                       clear_childcpus(rdtgrp, cpu);
-                       break;
-               }
-       }
-
-       if (!l3->mon_capable)
-               goto out_unlock;
-
-       d = get_mon_domain_from_cpu(cpu, l3);
-       if (d) {
-               if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
-                       cancel_delayed_work(&d->mbm_over);
-                       mbm_setup_overflow_handler(d, 0, cpu);
-               }
-               if (resctrl_arch_is_llc_occupancy_enabled() &&
-                   cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
-                       cancel_delayed_work(&d->cqm_limbo);
-                       cqm_setup_limbo_handler(d, 0, cpu);
-               }
-       }
-
-out_unlock:
-       mutex_unlock(&rdtgroup_mutex);
-}
-
-/*
- * resctrl_init - resctrl filesystem initialization
- *
- * Setup resctrl file system including set up root, create mount point,
- * register resctrl filesystem, and initialize files under root directory.
- *
- * Return: 0 on success or -errno
- */
-int resctrl_init(void)
-{
-       int ret = 0;
-
-       seq_buf_init(&last_cmd_status, last_cmd_status_buf,
-                    sizeof(last_cmd_status_buf));
-
-       rdtgroup_setup_default();
-
-       thread_throttle_mode_init();
-
-       ret = resctrl_mon_resource_init();
-       if (ret)
-               return ret;
-
-       ret = sysfs_create_mount_point(fs_kobj, "resctrl");
-       if (ret) {
-               resctrl_mon_resource_exit();
-               return ret;
-       }
-
-       ret = register_filesystem(&rdt_fs_type);
-       if (ret)
-               goto cleanup_mountpoint;
-
-       /*
-        * Adding the resctrl debugfs directory here may not be ideal since
-        * it would let the resctrl debugfs directory appear on the debugfs
-        * filesystem before the resctrl filesystem is mounted.
-        * It may also be ok since that would enable debugging of RDT before
-        * resctrl is mounted.
-        * The reason why the debugfs directory is created here and not in
-        * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
-        * during the debugfs directory creation also &sb->s_type->i_mutex_key
-        * (the lockdep class of inode->i_rwsem). Other filesystem
-        * interactions (eg. SyS_getdents) have the lock ordering:
-        * &sb->s_type->i_mutex_key --> &mm->mmap_lock
-        * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
-        * is taken, thus creating dependency:
-        * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
-        * issues considering the other two lock dependencies.
-        * By creating the debugfs directory here we avoid a dependency
-        * that may cause deadlock (even though file operations cannot
-        * occur until the filesystem is mounted, but I do not know how to
-        * tell lockdep that).
-        */
-       debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
-
-       return 0;
-
-cleanup_mountpoint:
-       sysfs_remove_mount_point(fs_kobj, "resctrl");
-       resctrl_mon_resource_exit();
-
-       return ret;
-}
-
-static bool resctrl_online_domains_exist(void)
-{
-       struct rdt_resource *r;
-
-       /*
-        * Only walk capable resources to allow resctrl_arch_get_resource()
-        * to return dummy 'not capable' resources.
-        */
-       for_each_alloc_capable_rdt_resource(r) {
-               if (!list_empty(&r->ctrl_domains))
-                       return true;
-       }
-
-       for_each_mon_capable_rdt_resource(r) {
-               if (!list_empty(&r->mon_domains))
-                       return true;
-       }
-
-       return false;
-}
-
-/**
- * resctrl_exit() - Remove the resctrl filesystem and free resources.
- *
- * Called by the architecture code in response to a fatal error.
- * Removes resctrl files and structures from kernfs to prevent further
- * configuration.
- *
- * When called by the architecture code, all CPUs and resctrl domains must be
- * offline. This ensures the limbo and overflow handlers are not scheduled to
- * run, meaning the data structures they access can be freed by
- * resctrl_mon_resource_exit().
- *
- * After resctrl_exit() returns, the architecture code should return an
- * error from all resctrl_arch_ functions that can do this.
- * resctrl_arch_get_resource() must continue to return struct rdt_resources
- * with the correct rid field to ensure the filesystem can be unmounted.
- */
-void resctrl_exit(void)
-{
-       cpus_read_lock();
-       WARN_ON_ONCE(resctrl_online_domains_exist());
-
-       mutex_lock(&rdtgroup_mutex);
-       resctrl_fs_teardown();
-       mutex_unlock(&rdtgroup_mutex);
-
-       cpus_read_unlock();
-
-       debugfs_remove_recursive(debugfs_resctrl);
-       debugfs_resctrl = NULL;
-       unregister_filesystem(&rdt_fs_type);
-
-       /*
-        * Do not remove the sysfs mount point added by resctrl_init() so that
-        * it can be used to umount resctrl.
-        */
-
-       resctrl_mon_resource_exit();
+       return;
  }
diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig

index 478a8e2ad99ff76934eabd0d689cbb94d2cefe7e..21671301bd8a490a397981ddd7a95b2ccdbce375 100644 (file)
--- a/fs/resctrl/Kconfig
+++ b/fs/resctrl/Kconfig
@@ -21,7 +21,7 @@ config RESCTRL_FS
           On architectures where this can be disabled independently, it is
           safe to say N.
  
-         See <file:Documentation/arch/x86/resctrl.rst> for more information.
+         See <file:Documentation/filesystems/resctrl.rst> for more information.
  
  config RESCTRL_FS_PSEUDO_LOCK
         bool
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6ed2dfd4dbbd9b3fb7eaa24e5847a04725237e0d 100644 (file)
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -0,0 +1,661 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/tick.h>
+
+#include "internal.h"
+
+struct rdt_parse_data {
+       struct rdtgroup         *rdtgrp;
+       char                    *buf;
+};
+
+typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
+                              struct resctrl_schema *s,
+                              struct rdt_ctrl_domain *d);
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+       int ret;
+       u32 bw;
+
+       /*
+        * Only linear delay values are supported for current Intel SKUs.
+        */
+       if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
+               rdt_last_cmd_puts("No support for non-linear MB domains\n");
+               return false;
+       }
+
+       ret = kstrtou32(buf, 10, &bw);
+       if (ret) {
+               rdt_last_cmd_printf("Invalid MB value %s\n", buf);
+               return false;
+       }
+
+       /* Nothing else to do if software controller is enabled. */
+       if (is_mba_sc(r)) {
+               *data = bw;
+               return true;
+       }
+
+       if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
+               rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
+                                   bw, r->membw.min_bw, r->membw.max_bw);
+               return false;
+       }
+
+       *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+       return true;
+}
+
+static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
+                   struct rdt_ctrl_domain *d)
+{
+       struct resctrl_staged_config *cfg;
+       u32 closid = data->rdtgrp->closid;
+       struct rdt_resource *r = s->res;
+       u32 bw_val;
+
+       cfg = &d->staged_config[s->conf_type];
+       if (cfg->have_new_ctrl) {
+               rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
+               return -EINVAL;
+       }
+
+       if (!bw_validate(data->buf, &bw_val, r))
+               return -EINVAL;
+
+       if (is_mba_sc(r)) {
+               d->mbps_val[closid] = bw_val;
+               return 0;
+       }
+
+       cfg->new_ctrl = bw_val;
+       cfg->have_new_ctrl = true;
+
+       return 0;
+}
+
+/*
+ * Check whether a cache bit mask is valid.
+ * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
+ *   - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
+ *   - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
+ *
+ * Haswell does not support a non-contiguous 1s value and additionally
+ * requires at least two bits set.
+ * AMD allows non-contiguous bitmasks.
+ */
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+       u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
+       unsigned int cbm_len = r->cache.cbm_len;
+       unsigned long first_bit, zero_bit, val;
+       int ret;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret) {
+               rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
+               return false;
+       }
+
+       if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
+               rdt_last_cmd_puts("Mask out of range\n");
+               return false;
+       }
+
+       first_bit = find_first_bit(&val, cbm_len);
+       zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+       /* Are non-contiguous bitmasks allowed? */
+       if (!r->cache.arch_has_sparse_bitmasks &&
+           (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
+               rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
+               return false;
+       }
+
+       if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
+               rdt_last_cmd_printf("Need at least %d bits in the mask\n",
+                                   r->cache.min_cbm_bits);
+               return false;
+       }
+
+       *data = val;
+       return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
+                    struct rdt_ctrl_domain *d)
+{
+       struct rdtgroup *rdtgrp = data->rdtgrp;
+       struct resctrl_staged_config *cfg;
+       struct rdt_resource *r = s->res;
+       u32 cbm_val;
+
+       cfg = &d->staged_config[s->conf_type];
+       if (cfg->have_new_ctrl) {
+               rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
+               return -EINVAL;
+       }
+
+       /*
+        * Cannot set up more than one pseudo-locked region in a cache
+        * hierarchy.
+        */
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+           rdtgroup_pseudo_locked_in_hierarchy(d)) {
+               rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
+               return -EINVAL;
+       }
+
+       if (!cbm_validate(data->buf, &cbm_val, r))
+               return -EINVAL;
+
+       if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+            rdtgrp->mode == RDT_MODE_SHAREABLE) &&
+           rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
+               rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
+               return -EINVAL;
+       }
+
+       /*
+        * The CBM may not overlap with the CBM of another closid if
+        * either is exclusive.
+        */
+       if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
+               rdt_last_cmd_puts("Overlaps with exclusive group\n");
+               return -EINVAL;
+       }
+
+       if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
+               if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+                       rdt_last_cmd_puts("Overlaps with other group\n");
+                       return -EINVAL;
+               }
+       }
+
+       cfg->new_ctrl = cbm_val;
+       cfg->have_new_ctrl = true;
+
+       return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ *     id=mask
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
+ */
+static int parse_line(char *line, struct resctrl_schema *s,
+                     struct rdtgroup *rdtgrp)
+{
+       enum resctrl_conf_type t = s->conf_type;
+       ctrlval_parser_t *parse_ctrlval = NULL;
+       struct resctrl_staged_config *cfg;
+       struct rdt_resource *r = s->res;
+       struct rdt_parse_data data;
+       struct rdt_ctrl_domain *d;
+       char *dom = NULL, *id;
+       unsigned long dom_id;
+
+       /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       switch (r->schema_fmt) {
+       case RESCTRL_SCHEMA_BITMAP:
+               parse_ctrlval = &parse_cbm;
+               break;
+       case RESCTRL_SCHEMA_RANGE:
+               parse_ctrlval = &parse_bw;
+               break;
+       }
+
+       if (WARN_ON_ONCE(!parse_ctrlval))
+               return -EINVAL;
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+           (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
+               rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+               return -EINVAL;
+       }
+
+next:
+       if (!line || line[0] == '\0')
+               return 0;
+       dom = strsep(&line, ";");
+       id = strsep(&dom, "=");
+       if (!dom || kstrtoul(id, 10, &dom_id)) {
+               rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+               return -EINVAL;
+       }
+       dom = strim(dom);
+       list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+               if (d->hdr.id == dom_id) {
+                       data.buf = dom;
+                       data.rdtgrp = rdtgrp;
+                       if (parse_ctrlval(&data, s, d))
+                               return -EINVAL;
+                       if (rdtgrp->mode ==  RDT_MODE_PSEUDO_LOCKSETUP) {
+                               cfg = &d->staged_config[t];
+                               /*
+                                * In pseudo-locking setup mode and just
+                                * parsed a valid CBM that should be
+                                * pseudo-locked. Only one locked region per
+                                * resource group and domain so just do
+                                * the required initialization for single
+                                * region and return.
+                                */
+                               rdtgrp->plr->s = s;
+                               rdtgrp->plr->d = d;
+                               rdtgrp->plr->cbm = cfg->new_ctrl;
+                               d->plr = rdtgrp->plr;
+                               return 0;
+                       }
+                       goto next;
+               }
+       }
+       return -EINVAL;
+}
+
+static int rdtgroup_parse_resource(char *resname, char *tok,
+                                  struct rdtgroup *rdtgrp)
+{
+       struct resctrl_schema *s;
+
+       list_for_each_entry(s, &resctrl_schema_all, list) {
+               if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
+                       return parse_line(tok, s, rdtgrp);
+       }
+       rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
+       return -EINVAL;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+                               char *buf, size_t nbytes, loff_t off)
+{
+       struct resctrl_schema *s;
+       struct rdtgroup *rdtgrp;
+       struct rdt_resource *r;
+       char *tok, *resname;
+       int ret = 0;
+
+       /* Valid input requires a trailing newline */
+       if (nbytes == 0 || buf[nbytes - 1] != '\n')
+               return -EINVAL;
+       buf[nbytes - 1] = '\0';
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               rdtgroup_kn_unlock(of->kn);
+               return -ENOENT;
+       }
+       rdt_last_cmd_clear();
+
+       /*
+        * No changes to pseudo-locked region allowed. It has to be removed
+        * and re-created instead.
+        */
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+               ret = -EINVAL;
+               rdt_last_cmd_puts("Resource group is pseudo-locked\n");
+               goto out;
+       }
+
+       rdt_staged_configs_clear();
+
+       while ((tok = strsep(&buf, "\n")) != NULL) {
+               resname = strim(strsep(&tok, ":"));
+               if (!tok) {
+                       rdt_last_cmd_puts("Missing ':'\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+               if (tok[0] == '\0') {
+                       rdt_last_cmd_printf("Missing '%s' value\n", resname);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
+               if (ret)
+                       goto out;
+       }
+
+       list_for_each_entry(s, &resctrl_schema_all, list) {
+               r = s->res;
+
+               /*
+                * Writes to mba_sc resources update the software controller,
+                * not the control MSR.
+                */
+               if (is_mba_sc(r))
+                       continue;
+
+               ret = resctrl_arch_update_domains(r, rdtgrp->closid);
+               if (ret)
+                       goto out;
+       }
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+               /*
+                * If pseudo-locking fails we keep the resource group in
+                * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
+                * active and updated for just the domain the pseudo-locked
+                * region was requested for.
+                */
+               ret = rdtgroup_pseudo_lock_create(rdtgrp);
+       }
+
+out:
+       rdt_staged_configs_clear();
+       rdtgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
+{
+       struct rdt_resource *r = schema->res;
+       struct rdt_ctrl_domain *dom;
+       bool sep = false;
+       u32 ctrl_val;
+
+       /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       seq_printf(s, "%*s:", max_name_width, schema->name);
+       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+               if (sep)
+                       seq_puts(s, ";");
+
+               if (is_mba_sc(r))
+                       ctrl_val = dom->mbps_val[closid];
+               else
+                       ctrl_val = resctrl_arch_get_config(r, dom, closid,
+                                                          schema->conf_type);
+
+               seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
+               sep = true;
+       }
+       seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+                          struct seq_file *s, void *v)
+{
+       struct resctrl_schema *schema;
+       struct rdtgroup *rdtgrp;
+       int ret = 0;
+       u32 closid;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (rdtgrp) {
+               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+                       list_for_each_entry(schema, &resctrl_schema_all, list) {
+                               seq_printf(s, "%s:uninitialized\n", schema->name);
+                       }
+               } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+                       if (!rdtgrp->plr->d) {
+                               rdt_last_cmd_clear();
+                               rdt_last_cmd_puts("Cache domain offline\n");
+                               ret = -ENODEV;
+                       } else {
+                               seq_printf(s, "%s:%d=%x\n",
+                                          rdtgrp->plr->s->res->name,
+                                          rdtgrp->plr->d->hdr.id,
+                                          rdtgrp->plr->cbm);
+                       }
+               } else {
+                       closid = rdtgrp->closid;
+                       list_for_each_entry(schema, &resctrl_schema_all, list) {
+                               if (closid < schema->num_closid)
+                                       show_doms(s, schema, closid);
+                       }
+               }
+       } else {
+               ret = -ENOENT;
+       }
+       rdtgroup_kn_unlock(of->kn);
+       return ret;
+}
+
+static int smp_mon_event_count(void *arg)
+{
+       mon_event_count(arg);
+
+       return 0;
+}
+
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off)
+{
+       struct rdtgroup *rdtgrp;
+       int ret = 0;
+
+       /* Valid input requires a trailing newline */
+       if (nbytes == 0 || buf[nbytes - 1] != '\n')
+               return -EINVAL;
+       buf[nbytes - 1] = '\0';
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               rdtgroup_kn_unlock(of->kn);
+               return -ENOENT;
+       }
+       rdt_last_cmd_clear();
+
+       if (!strcmp(buf, "mbm_local_bytes")) {
+               if (resctrl_arch_is_mbm_local_enabled())
+                       rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+               else
+                       ret = -EINVAL;
+       } else if (!strcmp(buf, "mbm_total_bytes")) {
+               if (resctrl_arch_is_mbm_total_enabled())
+                       rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+               else
+                       ret = -EINVAL;
+       } else {
+               ret = -EINVAL;
+       }
+
+       if (ret)
+               rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
+
+       rdtgroup_kn_unlock(of->kn);
+
+       return ret ?: nbytes;
+}
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+                                struct seq_file *s, void *v)
+{
+       struct rdtgroup *rdtgrp;
+       int ret = 0;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+       if (rdtgrp) {
+               switch (rdtgrp->mba_mbps_event) {
+               case QOS_L3_MBM_LOCAL_EVENT_ID:
+                       seq_puts(s, "mbm_local_bytes\n");
+                       break;
+               case QOS_L3_MBM_TOTAL_EVENT_ID:
+                       seq_puts(s, "mbm_total_bytes\n");
+                       break;
+               default:
+                       pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
+                       ret = -EINVAL;
+                       break;
+               }
+       } else {
+               ret = -ENOENT;
+       }
+
+       rdtgroup_kn_unlock(of->kn);
+
+       return ret;
+}
+
+struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
+                                          struct list_head **pos)
+{
+       struct rdt_domain_hdr *d;
+       struct list_head *l;
+
+       list_for_each(l, h) {
+               d = list_entry(l, struct rdt_domain_hdr, list);
+               /* When id is found, return its domain. */
+               if (id == d->id)
+                       return d;
+               /* Stop searching when finding id's position in sorted list. */
+               if (id < d->id)
+                       break;
+       }
+
+       if (pos)
+               *pos = l;
+
+       return NULL;
+}
+
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+                   struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+                   cpumask_t *cpumask, int evtid, int first)
+{
+       int cpu;
+
+       /* When picking a CPU from @cpumask, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       /*
+        * Set up the parameters to pass to mon_event_count() to read the data.
+        */
+       rr->rgrp = rdtgrp;
+       rr->evtid = evtid;
+       rr->r = r;
+       rr->d = d;
+       rr->first = first;
+       rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+       if (IS_ERR(rr->arch_mon_ctx)) {
+               rr->err = -EINVAL;
+               return;
+       }
+
+       cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
+
+       /*
+        * cpumask_any_housekeeping() prefers housekeeping CPUs, but
+        * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
+        * MPAM's resctrl_arch_rmid_read() is unable to read the
+        * counters on some platforms if it's called in IRQ context.
+        */
+       if (tick_nohz_full_cpu(cpu))
+               smp_call_function_any(cpumask, mon_event_count, rr, 1);
+       else
+               smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
+
+       resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+       struct kernfs_open_file *of = m->private;
+       enum resctrl_res_level resid;
+       enum resctrl_event_id evtid;
+       struct rdt_domain_hdr *hdr;
+       struct rmid_read rr = {0};
+       struct rdt_mon_domain *d;
+       struct rdtgroup *rdtgrp;
+       struct rdt_resource *r;
+       struct mon_data *md;
+       int domid, ret = 0;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       md = of->kn->priv;
+       if (WARN_ON_ONCE(!md)) {
+               ret = -EIO;
+               goto out;
+       }
+
+       resid = md->rid;
+       domid = md->domid;
+       evtid = md->evtid;
+       r = resctrl_arch_get_resource(resid);
+
+       if (md->sum) {
+               /*
+                * This file requires summing across all domains that share
+                * the L3 cache id that was provided in the "domid" field of the
+                * struct mon_data. Search all domains in the resource for
+                * one that matches this cache id.
+                */
+               list_for_each_entry(d, &r->mon_domains, hdr.list) {
+                       if (d->ci->id == domid) {
+                               rr.ci = d->ci;
+                               mon_event_read(&rr, r, NULL, rdtgrp,
+                                              &d->ci->shared_cpu_map, evtid, false);
+                               goto checkresult;
+                       }
+               }
+               ret = -ENOENT;
+               goto out;
+       } else {
+               /*
+                * This file provides data from a single domain. Search
+                * the resource to find the domain with "domid".
+                */
+               hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
+               if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               d = container_of(hdr, struct rdt_mon_domain, hdr);
+               mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
+       }
+
+checkresult:
+
+       if (rr.err == -EIO)
+               seq_puts(m, "Error\n");
+       else if (rr.err == -EINVAL)
+               seq_puts(m, "Unavailable\n");
+       else
+               seq_printf(m, "%llu\n", rr.val);
+
+out:
+       rdtgroup_kn_unlock(of->kn);
+       return ret;
+}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9a8cf6f11151d9c8512344505388135d4196c388 100644 (file)
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_RESCTRL_INTERNAL_H
+#define _FS_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+#include <linux/kernfs.h>
+#include <linux/fs_context.h>
+#include <linux/tick.h>
+
+#define CQM_LIMBOCHECK_INTERVAL        1000
+
+/**
+ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
+ *                             aren't marked nohz_full
+ * @mask:      The mask to pick a CPU from.
+ * @exclude_cpu:The CPU to avoid picking.
+ *
+ * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
+ * CPUs that don't use nohz_full, these are preferred. Pass
+ * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
+ *
+ * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
+ */
+static inline unsigned int
+cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
+{
+       unsigned int cpu;
+
+       /* Try to find a CPU that isn't nohz_full to use in preference */
+       if (tick_nohz_full_enabled()) {
+               cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu);
+               if (cpu < nr_cpu_ids)
+                       return cpu;
+       }
+
+       return cpumask_any_but(mask, exclude_cpu);
+}
+
+struct rdt_fs_context {
+       struct kernfs_fs_context        kfc;
+       bool                            enable_cdpl2;
+       bool                            enable_cdpl3;
+       bool                            enable_mba_mbps;
+       bool                            enable_debug;
+};
+
+static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
+{
+       struct kernfs_fs_context *kfc = fc->fs_private;
+
+       return container_of(kfc, struct rdt_fs_context, kfc);
+}
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid:             event id
+ * @name:              name of the event
+ * @configurable:      true if the event is configurable
+ * @list:              entry in &rdt_resource->evt_list
+ */
+struct mon_evt {
+       enum resctrl_event_id   evtid;
+       char                    *name;
+       bool                    configurable;
+       struct list_head        list;
+};
+
+/**
+ * struct mon_data - Monitoring details for each event file.
+ * @list:            Member of the global @mon_data_kn_priv_list list.
+ * @rid:             Resource id associated with the event file.
+ * @evtid:           Event id associated with the event file.
+ * @sum:             Set when event must be summed across multiple
+ *                   domains.
+ * @domid:           When @sum is zero this is the domain to which
+ *                   the event file belongs. When @sum is one this
+ *                   is the id of the L3 cache that all domains to be
+ *                   summed share.
+ *
+ * Pointed to by the kernfs kn->priv field of monitoring event files.
+ * Readers and writers must hold rdtgroup_mutex.
+ */
+struct mon_data {
+       struct list_head        list;
+       enum resctrl_res_level  rid;
+       enum resctrl_event_id   evtid;
+       int                     domid;
+       bool                    sum;
+};
+
+/**
+ * struct rmid_read - Data passed across smp_call*() to read event count.
+ * @rgrp:  Resource group for which the counter is being read. If it is a parent
+ *        resource group then its event count is summed with the count from all
+ *        its child resource groups.
+ * @r:    Resource describing the properties of the event being read.
+ * @d:    Domain that the counter should be read from. If NULL then sum all
+ *        domains in @r sharing L3 @ci.id
+ * @evtid: Which monitor event to read.
+ * @first: Initialize MBM counter when true.
+ * @ci:    Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @err:   Error encountered when reading counter.
+ * @val:   Returned value of event counter. If @rgrp is a parent resource group,
+ *        @val includes the sum of event counts from its child resource groups.
+ *        If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
+ *        (summed across child resource groups if @rgrp is a parent resource group).
+ * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
+ */
+struct rmid_read {
+       struct rdtgroup         *rgrp;
+       struct rdt_resource     *r;
+       struct rdt_mon_domain   *d;
+       enum resctrl_event_id   evtid;
+       bool                    first;
+       struct cacheinfo        *ci;
+       int                     err;
+       u64                     val;
+       void                    *arch_mon_ctx;
+};
+
+extern struct list_head resctrl_schema_all;
+
+extern bool resctrl_mounted;
+
+enum rdt_group_type {
+       RDTCTRL_GROUP = 0,
+       RDTMON_GROUP,
+       RDT_NUM_GROUP,
+};
+
+/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ *                          allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Total number of modes
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+       RDT_MODE_SHAREABLE = 0,
+       RDT_MODE_EXCLUSIVE,
+       RDT_MODE_PSEUDO_LOCKSETUP,
+       RDT_MODE_PSEUDO_LOCKED,
+
+       /* Must be last */
+       RDT_NUM_MODES,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn:               kernfs node for the mon_data directory
+ * @parent:                    parent rdtgrp
+ * @crdtgrp_list:              child rdtgroup node list
+ * @rmid:                      rmid for this rdtgroup
+ */
+struct mongroup {
+       struct kernfs_node      *mon_data_kn;
+       struct rdtgroup         *parent;
+       struct list_head        crdtgrp_list;
+       u32                     rmid;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn:                                kernfs node
+ * @rdtgroup_list:             linked list for all rdtgroups
+ * @closid:                    closid for this rdtgroup
+ * @cpu_mask:                  CPUs assigned to this rdtgroup
+ * @flags:                     status bits
+ * @waitcount:                 how many cpus expect to find this
+ *                             group when they acquire rdtgroup_mutex
+ * @type:                      indicates type of this rdtgroup - either
+ *                             monitor only or ctrl_mon group
+ * @mon:                       mongroup related data
+ * @mode:                      mode of resource group
+ * @mba_mbps_event:            input monitoring event id when mba_sc is enabled
+ * @plr:                       pseudo-locked region
+ */
+struct rdtgroup {
+       struct kernfs_node              *kn;
+       struct list_head                rdtgroup_list;
+       u32                             closid;
+       struct cpumask                  cpu_mask;
+       int                             flags;
+       atomic_t                        waitcount;
+       enum rdt_group_type             type;
+       struct mongroup                 mon;
+       enum rdtgrp_mode                mode;
+       enum resctrl_event_id           mba_mbps_event;
+       struct pseudo_lock_region       *plr;
+};
+
+/* rdtgroup.flags */
+#define        RDT_DELETED             1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO                    BIT(0)
+
+#define RFTYPE_BASE                    BIT(1)
+
+#define RFTYPE_CTRL                    BIT(4)
+
+#define RFTYPE_MON                     BIT(5)
+
+#define RFTYPE_TOP                     BIT(6)
+
+#define RFTYPE_RES_CACHE               BIT(8)
+
+#define RFTYPE_RES_MB                  BIT(9)
+
+#define RFTYPE_DEBUG                   BIT(10)
+
+#define RFTYPE_CTRL_INFO               (RFTYPE_INFO | RFTYPE_CTRL)
+
+#define RFTYPE_MON_INFO                        (RFTYPE_INFO | RFTYPE_MON)
+
+#define RFTYPE_TOP_INFO                        (RFTYPE_INFO | RFTYPE_TOP)
+
+#define RFTYPE_CTRL_BASE               (RFTYPE_BASE | RFTYPE_CTRL)
+
+#define RFTYPE_MON_BASE                        (RFTYPE_BASE | RFTYPE_MON)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width;
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name:      File name
+ * @mode:      Access mode
+ * @kf_ops:    File operations
+ * @flags:     File specific RFTYPE_FLAGS_* flags
+ * @fflags:    File specific RFTYPE_* flags
+ * @seq_show:  Show content of the file
+ * @write:     Write to the file
+ */
+struct rftype {
+       char                    *name;
+       umode_t                 mode;
+       const struct kernfs_ops *kf_ops;
+       unsigned long           flags;
+       unsigned long           fflags;
+
+       int (*seq_show)(struct kernfs_open_file *of,
+                       struct seq_file *sf, void *v);
+       /*
+        * write() is the generic write callback which maps directly to
+        * kernfs write operation and overrides all other operations.
+        * Maximum write size is determined by ->max_write_len.
+        */
+       ssize_t (*write)(struct kernfs_open_file *of,
+                        char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
+ * @prev_bw:   The most recent bandwidth in MBps
+ */
+struct mbm_state {
+       u64     prev_bw_bytes;
+       u32     prev_bw;
+};
+
+extern struct mutex rdtgroup_mutex;
+
+static inline const char *rdt_kn_name(const struct kernfs_node *kn)
+{
+       return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
+}
+
+extern struct rdtgroup rdtgroup_default;
+
+extern struct dentry *debugfs_resctrl;
+
+extern enum resctrl_event_id mba_mbps_default_event;
+
+void rdt_last_cmd_clear(void);
+
+void rdt_last_cmd_puts(const char *s);
+
+__printf(1, 2)
+void rdt_last_cmd_printf(const char *fmt, ...);
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+                            umode_t mask);
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+                               char *buf, size_t nbytes, loff_t off);
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+                          struct seq_file *s, void *v);
+
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off);
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+                                struct seq_file *s, void *v);
+
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
+                          unsigned long cbm, int closid, bool exclusive);
+
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+                                 unsigned long cbm);
+
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+
+int closids_supported(void);
+
+void closid_free(int closid);
+
+int alloc_rmid(u32 closid);
+
+void free_rmid(u32 closid, u32 rmid);
+
+void resctrl_mon_resource_exit(void);
+
+void mon_event_count(void *info);
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+                   struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+                   cpumask_t *cpumask, int evtid, int first);
+
+int resctrl_mon_resource_init(void);
+
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
+                               unsigned long delay_ms,
+                               int exclude_cpu);
+
+void mbm_handle_overflow(struct work_struct *work);
+
+bool is_mba_sc(struct rdt_resource *r);
+
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+                            int exclude_cpu);
+
+void cqm_handle_limbo(struct work_struct *work);
+
+bool has_busy_rmid(struct rdt_mon_domain *d);
+
+void __check_limbo(struct rdt_mon_domain *d, bool force_free);
+
+void resctrl_file_fflags_init(const char *config, unsigned long fflags);
+
+void rdt_staged_configs_clear(void);
+
+bool closid_allocated(unsigned int closid);
+
+int resctrl_find_cleanest_closid(void);
+
+#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
+
+int rdt_pseudo_lock_init(void);
+
+void rdt_pseudo_lock_release(void);
+
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+
+#else
+static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+       return false;
+}
+
+static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+       return false;
+}
+
+static inline int rdt_pseudo_lock_init(void) { return 0; }
+static inline void rdt_pseudo_lock_release(void) { }
+static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
+#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+
+#endif /* _FS_RESCTRL_INTERNAL_H */
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bde2801289d35d27f142898cda4ed7a29e5d69b7 100644 (file)
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ *    Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)    "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "monitor_trace.h"
+
+/**
+ * struct rmid_entry - dirty tracking for all RMID.
+ * @closid:    The CLOSID for this entry.
+ * @rmid:      The RMID for this entry.
+ * @busy:      The number of domains with cached data using this RMID.
+ * @list:      Member of the rmid_free_lru list when busy == 0.
+ *
+ * Depending on the architecture the correct monitor is accessed using
+ * both @closid and @rmid, or @rmid only.
+ *
+ * Take the rdtgroup_mutex when accessing.
+ */
+struct rmid_entry {
+       u32                             closid;
+       u32                             rmid;
+       int                             busy;
+       struct list_head                list;
+};
+
+/*
+ * @rmid_free_lru - A least recently used list of free RMIDs
+ *     These RMIDs are guaranteed to have an occupancy less than the
+ *     threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/*
+ * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
+ *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
+ *     Indexed by CLOSID. Protected by rdtgroup_mutex.
+ */
+static u32 *closid_num_dirty_rmid;
+
+/*
+ * @rmid_limbo_count - count of currently unused but (potentially)
+ *     dirty RMIDs.
+ *     This counts RMIDs that no one is currently using but that
+ *     may have an occupancy value > resctrl_rmid_realloc_threshold. User can
+ *     change the threshold occupancy value.
+ */
+static unsigned int rmid_limbo_count;
+
+/*
+ * @rmid_ptrs - The entries in the limbo and free lists, indexed by RMID index.
+ */
+static struct rmid_entry       *rmid_ptrs;
+
+/*
+ * This is the threshold cache occupancy in bytes at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int resctrl_rmid_realloc_threshold;
+
+/*
+ * This is the maximum value for the reallocation threshold, in bytes.
+ */
+unsigned int resctrl_rmid_realloc_limit;
+
+/*
+ * x86 and arm64 differ in their handling of monitoring.
+ * x86's RMID are independent numbers, there is only one source of traffic
+ * with an RMID value of '1'.
+ * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
+ * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
+ * value is no longer unique.
+ * To account for this, resctrl uses an index. On x86 this is just the RMID,
+ * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
+ *
+ * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
+ * must accept an attempt to read every index.
+ */
+static inline struct rmid_entry *__rmid_entry(u32 idx)
+{
+       struct rmid_entry *entry;
+       u32 closid, rmid;
+
+       entry = &rmid_ptrs[idx];
+       resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
+
+       WARN_ON_ONCE(entry->closid != closid);
+       WARN_ON_ONCE(entry->rmid != rmid);
+
+       return entry;
+}
+
+static void limbo_release_entry(struct rmid_entry *entry)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       rmid_limbo_count--;
+       list_add_tail(&entry->list, &rmid_free_lru);
+
+       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+               closid_num_dirty_rmid[entry->closid]--;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold, or @force_free is set,
+ * clear the busy bit and decrement the busy count. When the busy count
+ * of an RMID reaches zero the RMID is moved to the free list.
+ */
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
+{
+       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+       struct rmid_entry *entry;
+       u32 idx, cur_idx = 1;
+       void *arch_mon_ctx;
+       bool rmid_dirty;
+       u64 val = 0;
+
+       arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
+       if (IS_ERR(arch_mon_ctx)) {
+               pr_warn_ratelimited("Failed to allocate monitor context: %ld\n",
+                                   PTR_ERR(arch_mon_ctx));
+               return;
+       }
+
+       /*
+        * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+        * are marked as busy for occupancy < threshold. If the occupancy
+        * is less than the threshold decrement the busy counter of the
+        * RMID and move it to the free list when the counter reaches 0.
+        */
+       for (;;) {
+               idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
+               if (idx >= idx_limit)
+                       break;
+
+               entry = __rmid_entry(idx);
+               if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
+                                          QOS_L3_OCCUP_EVENT_ID, &val,
+                                          arch_mon_ctx)) {
+                       rmid_dirty = true;
+               } else {
+                       rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+                       /*
+                        * x86's CLOSID and RMID are independent numbers, so the entry's
+                        * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+                        * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+                        * used to select the configuration. It is thus necessary to track both
+                        * CLOSID and RMID because there may be dependencies between them
+                        * on some architectures.
+                        */
+                       trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
+               }
+
+               if (force_free || !rmid_dirty) {
+                       clear_bit(idx, d->rmid_busy_llc);
+                       if (!--entry->busy)
+                               limbo_release_entry(entry);
+               }
+               cur_idx = idx + 1;
+       }
+
+       resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
+}
+
+bool has_busy_rmid(struct rdt_mon_domain *d)
+{
+       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+
+       return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
+}
+
+static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
+{
+       struct rmid_entry *itr;
+       u32 itr_idx, cmp_idx;
+
+       if (list_empty(&rmid_free_lru))
+               return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
+
+       list_for_each_entry(itr, &rmid_free_lru, list) {
+               /*
+                * Get the index of this free RMID, and the index it would need
+                * to be if it were used with this CLOSID.
+                * If the CLOSID is irrelevant on this architecture, the two
+                * index values are always the same on every entry and thus the
+                * very first entry will be returned.
+                */
+               itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
+               cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
+
+               if (itr_idx == cmp_idx)
+                       return itr;
+       }
+
+       return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
+ *                                  RMID are clean, or the CLOSID that has
+ *                                  the most clean RMID.
+ *
+ * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
+ * may not be able to allocate clean RMID. To avoid this the allocator will
+ * choose the CLOSID with the most clean RMID.
+ *
+ * When the CLOSID and RMID are independent numbers, the first free CLOSID will
+ * be returned.
+ */
+int resctrl_find_cleanest_closid(void)
+{
+       u32 cleanest_closid = ~0;
+       int i = 0;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+               return -EIO;
+
+       for (i = 0; i < closids_supported(); i++) {
+               int num_dirty;
+
+               if (closid_allocated(i))
+                       continue;
+
+               num_dirty = closid_num_dirty_rmid[i];
+               if (num_dirty == 0)
+                       return i;
+
+               if (cleanest_closid == ~0)
+                       cleanest_closid = i;
+
+               if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
+                       cleanest_closid = i;
+       }
+
+       if (cleanest_closid == ~0)
+               return -ENOSPC;
+
+       return cleanest_closid;
+}
+
+/*
+ * For MPAM the RMID value is not unique, and has to be considered with
+ * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
+ * allows all domains to be managed by a single free list.
+ * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
+ */
+int alloc_rmid(u32 closid)
+{
+       struct rmid_entry *entry;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       entry = resctrl_find_free_rmid(closid);
+       if (IS_ERR(entry))
+               return PTR_ERR(entry);
+
+       list_del(&entry->list);
+       return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       struct rdt_mon_domain *d;
+       u32 idx;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       /* Walking r->mon_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
+
+       entry->busy = 0;
+       list_for_each_entry(d, &r->mon_domains, hdr.list) {
+               /*
+                * For the first limbo RMID in the domain,
+                * set up the limbo worker.
+                */
+               if (!has_busy_rmid(d))
+                       cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
+                                               RESCTRL_PICK_ANY_CPU);
+               set_bit(idx, d->rmid_busy_llc);
+               entry->busy++;
+       }
+
+       rmid_limbo_count++;
+       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+               closid_num_dirty_rmid[entry->closid]++;
+}
+
+void free_rmid(u32 closid, u32 rmid)
+{
+       u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+       struct rmid_entry *entry;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       /*
+        * Do not allow the default rmid to be free'd. Comparing by index
+        * allows architectures that ignore the closid parameter to avoid an
+        * unnecessary check.
+        */
+       if (!resctrl_arch_mon_capable() ||
+           idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+                                               RESCTRL_RESERVED_RMID))
+               return;
+
+       entry = __rmid_entry(idx);
+
+       if (resctrl_arch_is_llc_occupancy_enabled())
+               add_rmid_to_limbo(entry);
+       else
+               list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
+                                      u32 rmid, enum resctrl_event_id evtid)
+{
+       /* The per-domain MBM arrays are indexed by the encoded closid/rmid. */
+       u32 slot = resctrl_arch_rmid_idx_encode(closid, rmid);
+
+       if (evtid == QOS_L3_MBM_TOTAL_EVENT_ID)
+               return &d->mbm_total[slot];
+       if (evtid == QOS_L3_MBM_LOCAL_EVENT_ID)
+               return &d->mbm_local[slot];
+
+       /* No cached state exists for any other event. */
+       return NULL;
+}
+
+static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
+{
+       int cpu = smp_processor_id();
+       struct rdt_mon_domain *d;
+       struct mbm_state *m;
+       int err, ret;
+       u64 tval = 0;
+
+       if (rr->first) {
+               resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+               m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+               if (m)
+                       memset(m, 0, sizeof(struct mbm_state));
+               return 0;       /* first read only resets; rr->val is untouched */
+       }
+
+       if (rr->d) {
+               /* Reading a single domain, must be on a CPU in that domain. */
+               if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+                       return -EINVAL;
+               rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+                                                rr->evtid, &tval, rr->arch_mon_ctx);
+               if (rr->err)
+                       return rr->err;
+
+               rr->val += tval;
+
+               return 0;
+       }
+
+       /* Summing domains that share a cache, must be on a CPU for that cache. */
+       if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+               return -EINVAL;
+
+       /*
+        * Legacy files must report the sum of an event across all
+        * domains that share the same L3 cache instance.
+        * Report success if a read from any domain succeeds, -EINVAL
+        * (translated to "Unavailable" for user space) if reading from
+        * all domains fails for any reason.
+        */
+       ret = -EINVAL;
+       list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+               if (d->ci->id != rr->ci->id)
+                       continue;       /* domain belongs to another cache instance */
+               err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+                                            rr->evtid, &tval, rr->arch_mon_ctx);
+               if (!err) {
+                       rr->val += tval;
+                       ret = 0;
+               }
+       }
+
+       if (ret)
+               rr->err = ret;
+
+       return ret;
+}
+
+/*
+ * mbm_bw_count() - Update bw count from values previously read by
+ *                 __mon_event_count().
+ * @closid:    The closid used to identify the cached mbm_state.
+ * @rmid:      The rmid used to identify the cached mbm_state.
+ * @rr:                The struct rmid_read populated by __mon_event_count().
+ *
+ * Supporting function to calculate the memory bandwidth in MBps. The
+ * value previously read by __mon_event_count() (rr->val) is compared with
+ * the value saved by the previous invocation and the result is cached in
+ * the mbm_state. This must be called once per second to maintain MBps.
+ */
+static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
+{
+       u64 cur_bw, bytes, cur_bytes;
+       struct mbm_state *m;
+
+       m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+       if (WARN_ON_ONCE(!m))
+               return;
+
+       cur_bytes = rr->val;
+       bytes = cur_bytes - m->prev_bw_bytes;   /* bytes since the last call */
+       m->prev_bw_bytes = cur_bytes;
+
+       cur_bw = bytes / SZ_1M;
+
+       m->prev_bw = cur_bw;
+}
+
+/*
+ * This is scheduled by mon_event_read() to read the CQM/MBM counters
+ * on a domain. @info is the struct rmid_read describing the request.
+ */
+void mon_event_count(void *info)
+{
+       struct rdtgroup *rdtgrp, *entry;
+       struct rmid_read *rr = info;
+       struct list_head *head;
+       int ret;
+
+       rdtgrp = rr->rgrp;
+
+       ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
+
+       /*
+        * For Ctrl groups read data from child monitor groups and
+        * add them together. Count events which are read successfully.
+        * Discard the errors reported by the individual reads.
+        */
+       head = &rdtgrp->mon.crdtgrp_list;
+
+       if (rdtgrp->type == RDTCTRL_GROUP) {
+               list_for_each_entry(entry, head, mon.crdtgrp_list) {
+                       if (__mon_event_count(entry->closid, entry->mon.rmid,
+                                             rr) == 0)
+                               ret = 0;
+               }
+       }
+
+       /*
+        * __mon_event_count() calls for newly created monitor groups may
+        * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
+        * Discard error if any of the monitor event reads succeeded.
+        */
+       if (ret == 0)
+               rr->err = 0;
+}
+
+static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
+                                                       struct rdt_resource *r)
+{
+       struct rdt_ctrl_domain *dom;
+
+       lockdep_assert_cpus_held();
+
+       /* Return the first control domain of @r whose cpu_mask has @cpu. */
+       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+               if (!cpumask_test_cpu(cpu, &dom->hdr.cpu_mask))
+                       continue;
+               return dom;
+       }
+       return NULL;
+}
+
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values.  To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
+{
+       u32 closid, rmid, cur_msr_val, new_msr_val;
+       struct mbm_state *pmbm_data, *cmbm_data;
+       struct rdt_ctrl_domain *dom_mba;
+       enum resctrl_event_id evt_id;
+       struct rdt_resource *r_mba;
+       struct list_head *head;
+       struct rdtgroup *entry;
+       u32 cur_bw, user_bw;
+
+       r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+       evt_id = rgrp->mba_mbps_event;
+
+       closid = rgrp->closid;
+       rmid = rgrp->mon.rmid;
+       pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
+       if (WARN_ON_ONCE(!pmbm_data))
+               return;
+
+       dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
+       if (!dom_mba) {
+               pr_warn_once("Failure to get domain for MBA update\n");
+               return;
+       }
+
+       cur_bw = pmbm_data->prev_bw;
+       user_bw = dom_mba->mbps_val[closid];
+
+       /* MBA resource doesn't support CDP */
+       cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
+
+       /*
+        * For Ctrl groups add the bandwidth of the child monitor groups.
+        */
+       head = &rgrp->mon.crdtgrp_list;
+       list_for_each_entry(entry, head, mon.crdtgrp_list) {
+               cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
+               if (WARN_ON_ONCE(!cmbm_data))
+                       return;
+               cur_bw += cmbm_data->prev_bw;
+       }
+
+       /*
+        * Scale up/down the bandwidth linearly for the ctrl group.  The
+        * bandwidth step is the bandwidth granularity specified by the
+        * hardware. Always increase throttling if current bandwidth is above
+        * the target set by user. But avoid thrashing up and down on every
+        * poll by checking whether a decrease in throttling is likely to push
+        * the group back over target. E.g. if currently throttling to 30% of
+        * bandwidth on a system with 10% granularity steps, check whether
+        * moving to 40% would go past the limit by multiplying current
+        * bandwidth by "(30 + 10) / 30".
+        * NOTE(review): the check below adds membw.min_bw where the example
+        * (and the step applied) use membw.bw_gran - confirm they always match.
+        */
+       if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+               new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+       } else if (cur_msr_val < MAX_MBA_BW &&
+                  (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
+               new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+       } else {
+               return;
+       }
+
+       resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
+}
+
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+                                u32 closid, u32 rmid, enum resctrl_event_id evtid)
+{
+       struct rmid_read rr = {0};
+
+       rr.r = r;
+       rr.d = d;
+       rr.evtid = evtid;
+       rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+       if (IS_ERR(rr.arch_mon_ctx)) {
+               pr_warn_ratelimited("Failed to allocate monitor context: %ld\n",
+                                   PTR_ERR(rr.arch_mon_ctx));
+               return;
+       }
+
+       __mon_event_count(closid, rmid, &rr);   /* result is left in rr.val */
+
+       /*
+        * If the software controller is enabled, turn the value just read
+        * into a bandwidth figure for this event id.
+        */
+       if (is_mba_sc(NULL))
+               mbm_bw_count(closid, rmid, &rr);
+
+       resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
+
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+                      u32 closid, u32 rmid)
+{
+       /*
+        * Update each enabled MBM event. Concurrent reads from user are excluded
+        * as both the user and overflow handler hold the global mutex.
+        */
+       if (resctrl_arch_is_mbm_total_enabled())
+               mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+       if (resctrl_arch_is_mbm_local_enabled())
+               mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs whose occupancy is
+ * below threshold_occupancy to the free list.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+       unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+       struct rdt_mon_domain *d;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
+
+       __check_limbo(d, false);
+
+       if (has_busy_rmid(d)) { /* RMIDs still busy: check again after @delay */
+               d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+                                                          RESCTRL_PICK_ANY_CPU);
+               schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
+                                        delay);
+       }
+
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+}
+
+/**
+ * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
+ *                             domain.
+ * @dom:           The domain the limbo handler should run for.
+ * @delay_ms:      How far in the future the handler should run.
+ * @exclude_cpu:   Which CPU the handler should not run on,
+ *                RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+                            int exclude_cpu)
+{
+       unsigned long delay = msecs_to_jiffies(delay_ms);
+       int cpu;
+
+       cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+       dom->cqm_work_cpu = cpu;        /* recorded even if nothing is scheduled */
+
+       if (cpu < nr_cpu_ids)   /* only schedule on a valid CPU number */
+               schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+       unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+       struct rdtgroup *prgrp, *crgrp;
+       struct rdt_mon_domain *d;
+       struct list_head *head;
+       struct rdt_resource *r;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       /*
+        * If the filesystem has been unmounted this work no longer needs to
+        * run.
+        */
+       if (!resctrl_mounted || !resctrl_arch_mon_capable())
+               goto out_unlock;
+
+       r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       d = container_of(work, struct rdt_mon_domain, mbm_over.work);
+       /* Update the MBM events of every group and of its child monitor groups. */
+       list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+               mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
+
+               head = &prgrp->mon.crdtgrp_list;
+               list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+                       mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
+
+               if (is_mba_sc(NULL))
+                       update_mba_bw(prgrp, d);
+       }
+
+       /*
+        * Re-check for housekeeping CPUs. This allows the overflow handler to
+        * move off a nohz_full CPU quickly.
+        */
+       d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+                                                  RESCTRL_PICK_ANY_CPU);
+       schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
+
+out_unlock:
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+}
+
+/**
+ * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
+ *                                domain.
+ * @dom:           The domain the overflow handler should run for.
+ * @delay_ms:      How far in the future the handler should run.
+ * @exclude_cpu:   Which CPU the handler should not run on,
+ *                RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+                               int exclude_cpu)
+{
+       unsigned long delay = msecs_to_jiffies(delay_ms);
+       int cpu;
+
+       /*
+        * When a domain comes online there is no guarantee the filesystem is
+        * mounted. If not, there is no need to catch counter overflow.
+        */
+       if (!resctrl_mounted || !resctrl_arch_mon_capable())
+               return;
+       cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+       dom->mbm_work_cpu = cpu;        /* recorded even if nothing is scheduled */
+
+       if (cpu < nr_cpu_ids)   /* only schedule on a valid CPU number */
+               schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+       u32 num_closid = resctrl_arch_get_num_closid(r);
+       struct rmid_entry *entry = NULL;
+       int err = 0;
+       u32 idx;
+
+       mutex_lock(&rdtgroup_mutex);
+       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+               u32 *tmp;
+
+               /*
+                * If the architecture hasn't provided a sanitised value here,
+                * this may result in larger arrays than necessary. Resctrl will
+                * use a smaller system wide value based on the resources in
+                * use.
+                */
+               tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+               if (!tmp) {
+                       err = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               closid_num_dirty_rmid = tmp;
+       }
+
+       rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
+       if (!rmid_ptrs) {
+               if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+                       kfree(closid_num_dirty_rmid);
+                       closid_num_dirty_rmid = NULL;
+               }
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+
+       for (idx = 0; idx < idx_limit; idx++) { /* u32 index matches idx_limit */
+               entry = &rmid_ptrs[idx];
+               INIT_LIST_HEAD(&entry->list);
+
+               resctrl_arch_rmid_idx_decode(idx, &entry->closid, &entry->rmid);
+               list_add_tail(&entry->list, &rmid_free_lru);
+       }
+
+       /*
+        * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
+        * are always allocated. These are used for the rdtgroup_default
+        * control group, which will be setup later in resctrl_init().
+        */
+       idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+                                          RESCTRL_RESERVED_RMID);
+       entry = __rmid_entry(idx);
+       list_del(&entry->list); /* take the reserved entry off the free list */
+
+out_unlock:
+       mutex_unlock(&rdtgroup_mutex);
+
+       return err;
+}
+
+static void dom_data_exit(struct rdt_resource *r)
+{
+       mutex_lock(&rdtgroup_mutex);
+       /* resctrl_mon_resource_init() allocates nothing unless mon_capable. */
+       if (!r->mon_capable)
+               goto out_unlock;
+
+       if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+               kfree(closid_num_dirty_rmid);
+               closid_num_dirty_rmid = NULL;
+       }
+
+       kfree(rmid_ptrs);
+       rmid_ptrs = NULL;
+
+out_unlock:
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct mon_evt llc_occupancy_event = {   /* listed when LLC occupancy is enabled */
+       .name           = "llc_occupancy",
+       .evtid          = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {       /* listed when MBM total is enabled */
+       .name           = "mbm_total_bytes",
+       .evtid          = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {       /* listed when MBM local is enabled */
+       .name           = "mbm_local_bytes",
+       .evtid          = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource with the enabled events.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+       INIT_LIST_HEAD(&r->evt_list);
+
+       if (resctrl_arch_is_llc_occupancy_enabled())
+               list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+       if (resctrl_arch_is_mbm_total_enabled())
+               list_add_tail(&mbm_total_event.list, &r->evt_list);
+       if (resctrl_arch_is_mbm_local_enabled())
+               list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success (also when L3 is not mon_capable), or -ENOMEM.
+ */
+int resctrl_mon_resource_init(void)
+{
+       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       int ret;
+
+       if (!r->mon_capable)
+               return 0;
+
+       ret = dom_data_init(r);
+       if (ret)
+               return ret;
+
+       l3_mon_evt_init(r);
+
+       if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+               mbm_total_event.configurable = true;
+               resctrl_file_fflags_init("mbm_total_bytes_config",
+                                        RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+       }
+       if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+               mbm_local_event.configurable = true;
+               resctrl_file_fflags_init("mbm_local_bytes_config",
+                                        RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+       }
+       /* Prefer the local event over the total event as the mba_MBps default. */
+       if (resctrl_arch_is_mbm_local_enabled())
+               mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+       else if (resctrl_arch_is_mbm_total_enabled())
+               mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+       return 0;
+}
+
+void resctrl_mon_resource_exit(void)
+{
+       struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+       dom_data_exit(r);       /* frees what dom_data_init() allocated */
+}
diff --git a/fs/resctrl/monitor_trace.h b/fs/resctrl/monitor_trace.h

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fdf49f22576a9b6825d3ade8178eacf99e951256 100644 (file)
--- a/fs/resctrl/monitor_trace.h
+++ b/fs/resctrl/monitor_trace.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _FS_RESCTRL_MONITOR_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mon_llc_occupancy_limbo,     /* event in TRACE_SYSTEM resctrl */
+           TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+           TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+           TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+                            __field(u32, mon_hw_id)
+                            __field(int, domain_id)
+                            __field(u64, llc_occupancy_bytes)),
+           TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+                          __entry->mon_hw_id = mon_hw_id;
+                          __entry->domain_id = domain_id;
+                          __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+           TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+                     __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+                     __entry->llc_occupancy_bytes)
+          );
+
+#endif /* _FS_RESCTRL_MONITOR_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE monitor_trace
+
+#include <trace/define_trace.h>
diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ccc2f9213b4b6b5d70ef89393d6268456996fa15 100644 (file)
--- a/fs/resctrl/pseudo_lock.c
+++ b/fs/resctrl/pseudo_lock.c
@@ -0,0 +1,1105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/* Major number shared by all devices exposing pseudo-locked regions. */
+static unsigned int pseudo_lock_major;
+
+/* Set bits are free minors. NOTE(review): bit MINORBITS is set but is
+ * never searched by pseudo_lock_minor_get() - confirm that is intended.
+ */
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
+{
+       const struct rdtgroup *rdtgrp;
+
+       rdtgrp = dev_get_drvdata(dev);
+       if (mode)
+               *mode = 0600;   /* owner read/write only */
+       guard(mutex)(&rdtgroup_mutex);  /* scoped lock: released on return */
+       return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
+}
+
+static const struct class pseudo_lock_class = {
+       .name = "pseudo_lock",
+       .devnode = pseudo_lock_devnode, /* builds "pseudo_lock/<rdt_kn_name()>" */
+};
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, -ENOSPC if no minor number is available.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+       unsigned long first_bit;
+
+       first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+       if (first_bit == MINORBITS)     /* no set bit among the searched bits */
+               return -ENOSPC;
+
+       __clear_bit(first_bit, &pseudo_lock_minor_avail);
+       *minor = first_bit;
+
+       return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to the available set
+ * @minor: The minor number made available
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+       __set_bit(minor, &pseudo_lock_minor_avail);     /* set bit == free minor */
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs to.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ *         region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+       struct rdtgroup *grp;
+
+       list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
+               /* Groups without a pseudo-locked region cannot match. */
+               if (!grp->plr || grp->plr->minor != minor)
+                       continue;
+               return grp;
+       }
+       return NULL;
+}
+
+/**
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
+ * @list:      Entry within the @pm_reqs list for a pseudo-locked region
+ * @req:       PM QoS request added by pseudo_lock_cstates_constrain()
+ */
+struct pseudo_lock_pm_req {
+       struct list_head list;
+       struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+       struct pseudo_lock_pm_req *pm_req, *next;
+
+       list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+               dev_pm_qos_remove_request(&pm_req->req);
+               list_del(&pm_req->list);        /* fine: walking with the _safe variant */
+               kfree(pm_req);
+       }
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+       struct pseudo_lock_pm_req *pm_req;
+       int cpu;
+       int ret;
+
+       for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
+               pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+               if (!pm_req) {
+                       rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
+                       ret = -ENOMEM;
+                       goto out_err;
+               }
+               ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+                                            &pm_req->req,
+                                            DEV_PM_QOS_RESUME_LATENCY,
+                                            30);       /* NOTE(review): presumably usecs - confirm */
+               if (ret < 0) {
+                       rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
+                                           cpu);
+                       kfree(pm_req);
+                       ret = -1;       /* NOTE(review): discards the errno returned above */
+                       goto out_err;
+               }
+               list_add(&pm_req->list, &plr->pm_reqs);
+       }
+
+       return 0;
+
+out_err:
+       pseudo_lock_cstates_relax(plr); /* drop the requests already on plr->pm_reqs */
+       return ret;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * is freed.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+       plr->size = 0;
+       plr->line_size = 0;
+       kfree(plr->kmem);
+       plr->kmem = NULL;
+       plr->s = NULL;
+       if (plr->d)
+               plr->d->plr = NULL;     /* detach the region from its domain */
+       plr->d = NULL;
+       plr->cbm = 0;
+       plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further. This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ *   to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ *   flow can be executed
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+       enum resctrl_scope scope = plr->s->res->ctrl_scope;
+       struct cacheinfo *ci;
+       int ret;
+
+       if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+               return -ENODEV;
+
+       /* Pick the first cpu we find that is associated with the cache. */
+       plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+
+       if (!cpu_online(plr->cpu)) {
+               rdt_last_cmd_printf("CPU %u associated with cache not online\n",
+                                   plr->cpu);
+               ret = -ENODEV;
+               goto out_region;
+       }
+
+       ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+       if (ci) {
+               plr->line_size = ci->coherency_line_size;
+               plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+               return 0;
+       }
+
+       ret = -1;       /* NOTE(review): not a named errno; -1 reads as -EPERM */
+       rdt_last_cmd_puts("Unable to determine cache line size\n");
+out_region:
+       pseudo_lock_region_clear(plr);  /* also resets the s, d and cbm set on entry */
+       return ret;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * A pseudo-locked region is associated with a resource group. When this
+ * association is created the pseudo-locked region is initialized. The
+ * details of the pseudo-locked region are not known at this time so only
+ * allocation is done and association established.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+       struct pseudo_lock_region *plr;
+
+       plr = kzalloc(sizeof(*plr), GFP_KERNEL);
+       if (!plr)
+               return -ENOMEM;
+
+       init_waitqueue_head(&plr->lock_thread_wq);
+       INIT_LIST_HEAD(&plr->pm_reqs);
+       rdtgrp->plr = plr;      /* released again by pseudo_lock_free() */
+       return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Initialize the details required to set up the pseudo-locked region and
+ * allocate the contiguous memory that will be pseudo-locked to the cache.
+ *
+ * Return: 0 on success, <0 on failure.  Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+       int err = pseudo_lock_region_init(plr);
+
+       if (err < 0)
+               return err;
+
+       /*
+        * Contiguous regions larger than KMALLOC_MAX_SIZE are not yet
+        * supported.
+        */
+       if (plr->size > KMALLOC_MAX_SIZE) {
+               rdt_last_cmd_puts("Requested region exceeds maximum size\n");
+               err = -E2BIG;
+               goto out_clear;
+       }
+
+       plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+       if (!plr->kmem) {
+               rdt_last_cmd_puts("Unable to allocate memory\n");
+               err = -ENOMEM;
+               goto out_clear;
+       }
+
+       /* Region initialised and backing memory allocated. */
+       return 0;
+
+out_clear:
+       /* Undo pseudo_lock_region_init() on any failure past it. */
+       pseudo_lock_region_clear(plr);
+       return err;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * By the time this runs the region's resources have either been released
+ * already or were never created. Clear the region, free it and drop the
+ * resource group's pointer to it.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+       struct pseudo_lock_region *region = rdtgrp->plr;
+
+       pseudo_lock_region_clear(region);
+       kfree(region);
+       rdtgrp->plr = NULL;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring in progress
+ * @rdtgrp: resource group being queried
+ *
+ * Monitoring counts as "in progress" when the group's list of child
+ * monitor groups (@rdtgrp->mon.crdtgrp_list) is not empty.
+ *
+ * Return: 1 if monitor groups have been created for this resource
+ * group, 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+       if (list_empty(&rdtgrp->mon.crdtgrp_list))
+               return 0;
+
+       return 1;
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. The user is told so by restricting access to every file
+ * through which such a change could be made: "tasks", "cpus", "cpus_list"
+ * and, on monitoring capable systems, "mon_groups".
+ *
+ * Permissions are restored with rdtgroup_locksetup_user_restore().
+ *
+ * Return: 0 on success, <0 on failure. If restricting one of the files
+ * fails, an attempt is made to restore the files already restricted, but
+ * the resulting mode of these files is uncertain.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+       int err;
+
+       err = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+       if (err)
+               return err;
+
+       err = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+       if (err)
+               goto undo_tasks;
+
+       err = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+       if (err)
+               goto undo_cpus;
+
+       /* "mon_groups" is only touched on monitoring capable systems. */
+       if (!resctrl_arch_mon_capable())
+               return 0;
+
+       err = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+       if (!err)
+               return 0;
+
+       /* Unwind in reverse order of restriction. */
+       rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+undo_cpus:
+       rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+undo_tasks:
+       rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+       return err;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Give back the access to "tasks", "cpus", "cpus_list" and, on monitoring
+ * capable systems, "mon_groups" that was taken away by
+ * rdtgroup_locksetup_user_restrict().
+ *
+ * Return: 0 on success, <0 on failure. If restoring one of the files
+ * fails, an attempt is made to restrict the files already restored again,
+ * but the resulting mode of these files is uncertain.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+       int err;
+
+       err = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+       if (err)
+               return err;
+
+       err = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+       if (err)
+               goto redo_tasks;
+
+       err = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+       if (err)
+               goto redo_cpus;
+
+       /* "mon_groups" is only touched on monitoring capable systems. */
+       if (!resctrl_arch_mon_capable())
+               return 0;
+
+       err = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+       if (!err)
+               return 0;
+
+       /* Unwind in reverse order of restoration. */
+       rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+redo_cpus:
+       rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+redo_tasks:
+       rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+       return err;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Move a resource group into locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * Locksetup mode marks a resource group that is being prepared to
+ * represent a pseudo-locked region. Such a group loses the closid
+ * associated with it, so it must have no tasks or cpus assigned now and
+ * must not accept any in the future. Monitoring of a pseudo-locked region
+ * is not allowed either.
+ *
+ * These and further restrictions on a pseudo-locked region are checked
+ * and enforced here, before the resource group enters locksetup mode.
+ *
+ * Return: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+       bool cdp_enabled;
+       int err;
+
+       /*
+        * The default resource group can neither be removed nor lose the
+        * default closid associated with it.
+        */
+       if (rdtgrp == &rdtgroup_default) {
+               rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Cache Pseudo-locking is not supported while CDP is enabled.
+        *
+        * Points to consider before enabling that combination (L3 CDP used
+        * as the example):
+        * - With CDP enabled two separate resources, L3DATA and L3CODE,
+        *   are exposed even though both live on the same cache. A
+        *   pseudo-locked region created on a domain of one resource
+        *   (eg. L3CODE) therefore rules out a pseudo-locked region on
+        *   the same domain of the other resource (eg. L3DATA), because
+        *   creating a region involves a call to wbinvd that affects all
+        *   cache allocations on that domain.
+        * - Given the above, one option is to expose only one of the CDP
+        *   resources to pseudo-locking and hide the other. For example,
+        *   only L3DATA could be exposed: the L3 cache is unified, so
+        *   instructions can still be placed there and executed.
+        * - Even if only one resource is exposed, the availability of a
+        *   portion of cache for pseudo-locking must take both resources
+        *   into account. Likewise, the portion of cache used by a
+        *   pseudo-locked region created in one resource must be made
+        *   unavailable to all future allocations from both resources.
+        */
+       cdp_enabled = resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
+                     resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2);
+       if (cdp_enabled) {
+               rdt_last_cmd_puts("CDP enabled\n");
+               return -EINVAL;
+       }
+
+       /*
+        * A platform that does not know which bits disable prefetching
+        * does not support Cache Pseudo-Locking.
+        */
+       if (!resctrl_arch_get_prefetch_disable_bits()) {
+               rdt_last_cmd_puts("Pseudo-locking not supported\n");
+               return -EINVAL;
+       }
+
+       if (rdtgroup_monitor_in_progress(rdtgrp)) {
+               rdt_last_cmd_puts("Monitoring in progress\n");
+               return -EINVAL;
+       }
+
+       if (rdtgroup_tasks_assigned(rdtgrp)) {
+               rdt_last_cmd_puts("Tasks assigned to resource group\n");
+               return -EINVAL;
+       }
+
+       if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+               rdt_last_cmd_puts("CPUs assigned to resource group\n");
+               return -EINVAL;
+       }
+
+       if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+               rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
+               return -EIO;
+       }
+
+       err = pseudo_lock_init(rdtgrp);
+       if (err) {
+               rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
+               /* Hand the files restricted above back to the user. */
+               rdtgroup_locksetup_user_restore(rdtgrp);
+               return err;
+       }
+
+       /*
+        * On a monitoring capable system an rmid was allocated when the
+        * control group was created. A group used for pseudo-locking no
+        * longer needs it. This call is also safe on platforms that are
+        * not capable of monitoring.
+        */
+       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+       return 0;
+}
+
+/**
+ * rdtgroup_locksetup_exit - resource group exits locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted: on monitoring capable systems a new rmid is allocated for the
+ * group, user access to its files is restored and the pseudo-lock region
+ * is freed.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+       int err;
+
+       if (resctrl_arch_mon_capable()) {
+               int rmid = alloc_rmid(rdtgrp->closid);
+
+               if (rmid < 0) {
+                       rdt_last_cmd_puts("Out of RMIDs\n");
+                       return rmid;
+               }
+               rdtgrp->mon.rmid = rmid;
+       }
+
+       err = rdtgroup_locksetup_user_restore(rdtgrp);
+       if (!err) {
+               pseudo_lock_free(rdtgrp);
+               return 0;
+       }
+
+       /* Access could not be restored: give the rmid back again. */
+       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+       return err;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d represents a cache instance and @cbm a capacity bitmask that is being
+ * considered for it. Check whether @cbm shares any bit with the CBM of a
+ * pseudo-locked region that already exists on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+       struct pseudo_lock_region *plr = d->plr;
+       unsigned long locked_cbm;
+
+       /* No pseudo-locked region on this domain, nothing to overlap with. */
+       if (!plr)
+               return false;
+
+       /* Widen the region's CBM to unsigned long for the bitmap helper. */
+       locked_cbm = plr->cbm;
+
+       return bitmap_intersects(&cbm, &locked_cbm, plr->s->res->cache.cbm_len);
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * Setting up a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. New pseudo-locked regions must therefore
+ * not be created in a hierarchy that already contains one, which makes it
+ * essential to be able to tell whether such a region exists.
+ *
+ * The caller must hold the cpus lock, see lockdep_assert_cpus_held().
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ *         if it is not possible to test due to memory allocation issue,
+ *         false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+       cpumask_var_t locked_cpus;
+       struct rdt_ctrl_domain *dom;
+       struct rdt_resource *res;
+       bool found;
+
+       /* Walking r->domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       /* Without a scratch mask the test cannot be done: report "locked". */
+       if (!zalloc_cpumask_var(&locked_cpus, GFP_KERNEL))
+               return true;
+
+       /*
+        * Collect the cpus of every control domain that currently has a
+        * pseudo-locked region associated with it.
+        */
+       for_each_alloc_capable_rdt_resource(res) {
+               list_for_each_entry(dom, &res->ctrl_domains, hdr.list) {
+                       if (!dom->plr)
+                               continue;
+                       cpumask_or(locked_cpus, locked_cpus,
+                                  &dom->hdr.cpu_mask);
+               }
+       }
+
+       /*
+        * A new pseudo-locked region on @d would intersect with an
+        * existing region if the two cpu sets share a cpu.
+        */
+       found = cpumask_intersects(&d->hdr.cpu_mask, locked_cpus);
+
+       free_cpumask_var(locked_cpus);
+       return found;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
+ *
+ * Latency to a pseudo-locked region should be measured from a cpu that is
+ * associated with that region. Pick the first cpu of the region's domain,
+ * start the measurement thread selected by @sel on it (1: cycles latency,
+ * 2: L2 residency, 3: L3 residency) and wait for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+       struct pseudo_lock_region *plr = rdtgrp->plr;
+       int (*measure_fn)(void *data);
+       struct task_struct *thread;
+       unsigned int cpu;
+       int ret = -1;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       /* The group was deleted, or the region has no domain. */
+       if ((rdtgrp->flags & RDT_DELETED) || !plr->d) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       plr->thread_done = 0;
+       cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+       if (!cpu_online(cpu)) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       plr->cpu = cpu;
+
+       switch (sel) {
+       case 1:
+               measure_fn = resctrl_arch_measure_cycles_lat_fn;
+               break;
+       case 2:
+               measure_fn = resctrl_arch_measure_l2_residency;
+               break;
+       case 3:
+               measure_fn = resctrl_arch_measure_l3_residency;
+               break;
+       default:
+               /* Unknown selector: leave with ret still at -1. */
+               goto out;
+       }
+
+       thread = kthread_run_on_cpu(measure_fn, plr, cpu,
+                                   "pseudo_lock_measure/%u");
+       if (IS_ERR(thread)) {
+               ret = PTR_ERR(thread);
+               goto out;
+       }
+
+       /* Sleep until thread_done is set; a signal ends the wait early. */
+       ret = wait_event_interruptible(plr->lock_thread_wq,
+                                      plr->thread_done == 1);
+       if (ret > 0)
+               ret = 0;
+
+out:
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+       return ret;
+}
+
+/*
+ * Write handler of the "pseudo_lock_measure" debugfs file.
+ *
+ * Parses the written text as a decimal selector (1, 2 or 3) and runs the
+ * matching measurement through pseudo_lock_measure_cycles() on the
+ * resource group stored in file->private_data.
+ *
+ * Returns @count on success, -EFAULT if the user buffer cannot be copied,
+ * -EINVAL for a selector outside 1..3, or the error reported by
+ * kstrtoint(), debugfs_file_get() or pseudo_lock_measure_cycles().
+ */
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+                                          const char __user *user_buf,
+                                          size_t count, loff_t *ppos)
+{
+       struct rdtgroup *rdtgrp = file->private_data;
+       char text[32];
+       size_t len;
+       int sel;
+       int ret;
+
+       /* Leave room for the terminating NUL added below. */
+       len = min(count, (sizeof(text) - 1));
+       if (copy_from_user(text, user_buf, len))
+               return -EFAULT;
+       text[len] = '\0';
+
+       ret = kstrtoint(text, 10, &sel);
+       if (ret)
+               return ret;
+
+       if (sel < 1 || sel > 3)
+               return -EINVAL;
+
+       ret = debugfs_file_get(file->f_path.dentry);
+       if (ret)
+               return ret;
+
+       ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+       if (!ret)
+               ret = count;
+
+       debugfs_file_put(file->f_path.dentry);
+       return ret;
+}
+
+/* File operations of the per-group "pseudo_lock_measure" debugfs file. */
+static const struct file_operations pseudo_measure_fops = {
+       .open = simple_open,
+       .llseek = default_llseek,
+       .write = pseudo_lock_measure_trigger,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * rdtgroup_mutex is released and re-acquired inside this function, so it
+ * is expected to be held by the caller.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+       struct pseudo_lock_region *plr = rdtgrp->plr;
+       struct task_struct *thread;
+       unsigned int new_minor;
+       struct device *dev;
+       /* Freed automatically by kfree() when it goes out of scope. */
+       char *kn_name __free(kfree) = NULL;
+       int ret;
+
+       ret = pseudo_lock_region_alloc(plr);
+       if (ret < 0)
+               return ret;
+
+       ret = pseudo_lock_cstates_constrain(plr);
+       if (ret < 0) {
+               ret = -EINVAL;
+               goto out_region;
+       }
+       /*
+        * Private copy of the group's name: it names the debugfs directory
+        * and the character device created below while rdtgroup_mutex is
+        * not held.
+        */
+       kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
+       if (!kn_name) {
+               ret = -ENOMEM;
+               goto out_cstates;
+       }
+
+       plr->thread_done = 0;
+
+       /* Run the locking function on the region's cpu. */
+       thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
+                                   plr->cpu, "pseudo_lock/%u");
+       if (IS_ERR(thread)) {
+               ret = PTR_ERR(thread);
+               rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
+               goto out_cstates;
+       }
+
+       ret = wait_event_interruptible(plr->lock_thread_wq,
+                                      plr->thread_done == 1);
+       if (ret < 0) {
+               /*
+                * If the thread does not get on the CPU for whatever
+                * reason and the process which sets up the region is
+                * interrupted then this will leave the thread in runnable
+                * state and once it gets on the CPU it will dereference
+                * the cleared, but not freed, plr struct resulting in an
+                * empty pseudo-locking loop.
+                */
+               rdt_last_cmd_puts("Locking thread interrupted\n");
+               goto out_cstates;
+       }
+
+       ret = pseudo_lock_minor_get(&new_minor);
+       if (ret < 0) {
+               rdt_last_cmd_puts("Unable to obtain a new minor number\n");
+               goto out_cstates;
+       }
+
+       /*
+        * Unlock access but do not release the reference. The
+        * pseudo-locked region will still be here on return.
+        *
+        * The mutex has to be released temporarily to avoid a potential
+        * deadlock with the mm->mmap_lock which is obtained in the
+        * device_create() and debugfs_create_dir() callpath below as well as
+        * before the mmap() callback is called.
+        */
+       mutex_unlock(&rdtgroup_mutex);
+
+       /* The debugfs entries are optional: failures here are not fatal. */
+       if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
+               plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
+               if (!IS_ERR_OR_NULL(plr->debugfs_dir))
+                       debugfs_create_file("pseudo_lock_measure", 0200,
+                                           plr->debugfs_dir, rdtgrp,
+                                           &pseudo_measure_fops);
+       }
+
+       dev = device_create(&pseudo_lock_class, NULL,
+                           MKDEV(pseudo_lock_major, new_minor),
+                           rdtgrp, "%s", kn_name);
+
+       mutex_lock(&rdtgroup_mutex);
+
+       if (IS_ERR(dev)) {
+               ret = PTR_ERR(dev);
+               rdt_last_cmd_printf("Failed to create character device: %d\n",
+                                   ret);
+               goto out_debugfs;
+       }
+
+       /* We released the mutex - check if group was removed while we did so */
+       if (rdtgrp->flags & RDT_DELETED) {
+               ret = -ENODEV;
+               goto out_device;
+       }
+
+       plr->minor = new_minor;
+
+       /*
+        * Success: switch the group to pseudo-locked mode, release its
+        * closid and set "cpus" and "cpus_list" to mode 0444.
+        */
+       rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
+       closid_free(rdtgrp->closid);
+       rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
+       rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
+
+       ret = 0;
+       goto out;
+
+       /* Error unwinding: each label undoes one more step, newest first. */
+out_device:
+       device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+out_debugfs:
+       debugfs_remove_recursive(plr->debugfs_dir);
+       pseudo_lock_minor_release(new_minor);
+out_cstates:
+       pseudo_lock_cstates_relax(plr);
+out_region:
+       pseudo_lock_region_clear(plr);
+out:
+       return ret;
+}
+
+/**
+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
+ * @rdtgrp: resource group to which the pseudo-locked region belongs
+ *
+ * Removal of a pseudo-locked region is initiated either by a "rmdir" of
+ * the resource group from user space or by the unmount of the resctrl
+ * filesystem. The resource group does not return to pseudo-locksetup mode
+ * first, it is removed directly. This is asymmetric with creation: the
+ * &struct pseudo_lock_region is removed here although it was not created
+ * in rdtgroup_pseudo_lock_create().
+ *
+ * Return: void
+ */
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
+{
+       struct pseudo_lock_region *plr = rdtgrp->plr;
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+               /*
+                * Default group cannot be a pseudo-locked region so we can
+                * free closid here.
+                */
+               closid_free(rdtgrp->closid);
+       } else {
+               /* Tear down everything a created region holds. */
+               pseudo_lock_cstates_relax(plr);
+               debugfs_remove_recursive(plr->debugfs_dir);
+               device_destroy(&pseudo_lock_class,
+                              MKDEV(pseudo_lock_major, plr->minor));
+               pseudo_lock_minor_release(plr->minor);
+       }
+
+       pseudo_lock_free(rdtgrp);
+}
+
+/*
+ * open() handler of the pseudo-lock character device.
+ *
+ * Looks up the resource group by the device's minor number, stores it in
+ * filp->private_data and increments the group's waitcount, all under
+ * rdtgroup_mutex.
+ *
+ * Returns 0 on success, -ENODEV if no group is found for the minor.
+ */
+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
+{
+       struct rdtgroup *rdtgrp;
+       int ret = -ENODEV;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       rdtgrp = region_find_by_minor(iminor(inode));
+       if (rdtgrp) {
+               filp->private_data = rdtgrp;
+               atomic_inc(&rdtgrp->waitcount);
+               /* Perform a non-seekable open - llseek is not supported */
+               filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+               ret = 0;
+       }
+
+       mutex_unlock(&rdtgroup_mutex);
+
+       return ret;
+}
+
+/*
+ * release() handler of the pseudo-lock character device.
+ *
+ * Clears filp->private_data and decrements the waitcount of the resource
+ * group that was stored there, under rdtgroup_mutex.
+ *
+ * Returns 0 on success, -ENODEV (after a warning) if no group is attached
+ * to the file.
+ */
+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
+{
+       struct rdtgroup *rdtgrp;
+       int ret = 0;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       rdtgrp = filp->private_data;
+       if (WARN_ON(!rdtgrp)) {
+               ret = -ENODEV;
+       } else {
+               filp->private_data = NULL;
+               atomic_dec(&rdtgrp->waitcount);
+       }
+
+       mutex_unlock(&rdtgroup_mutex);
+       return ret;
+}
+
+/*
+ * mremap handler installed through pseudo_mmap_ops. @area is not used:
+ * the handler unconditionally returns -EINVAL.
+ */
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+{
+       /* Not supported */
+       return -EINVAL;
+}
+
+/* VMA operations set on mappings created by pseudo_lock_dev_mmap(). */
+static const struct vm_operations_struct pseudo_mmap_ops = {
+       .mremap = pseudo_lock_dev_mremap,
+};
+
+/*
+ * mmap() handler of the pseudo-lock character device.
+ *
+ * Maps the requested part of the region's kernel buffer (plr->kmem) into
+ * the caller's address space, after zeroing that part of the buffer. All
+ * checks and the mapping itself are done under rdtgroup_mutex.
+ *
+ * Returns 0 on success, -ENODEV if the file has no resource group or the
+ * region has no domain, -EINVAL if the caller's cpu affinity is not a
+ * subset of the domain's cpus or the mapping is not shared, -ENOSPC if the
+ * offset or length reach beyond the region, -EAGAIN if remap_pfn_range()
+ * fails.
+ */
+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       unsigned long vsize = vma->vm_end - vma->vm_start;
+       unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+       struct pseudo_lock_region *plr;
+       struct rdtgroup *rdtgrp;
+       unsigned long physical;
+       unsigned long psize;
+       int ret;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       rdtgrp = filp->private_data;
+       if (WARN_ON(!rdtgrp)) {
+               ret = -ENODEV;
+               goto unlock;
+       }
+
+       plr = rdtgrp->plr;
+       if (!plr->d) {
+               ret = -ENODEV;
+               goto unlock;
+       }
+
+       /*
+        * The task must run with affinity to the cpus associated with the
+        * pseudo-locked region. Otherwise it may be scheduled elsewhere
+        * and invalidate entries in the pseudo-locked region.
+        */
+       if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
+       physical = __pa(plr->kmem) >> PAGE_SHIFT;
+       /* Only meaningful once off <= plr->size is established just below. */
+       psize = plr->size - off;
+
+       if (off > plr->size) {
+               ret = -ENOSPC;
+               goto unlock;
+       }
+
+       /*
+        * Changes must be carried directly to the memory being mapped, so
+        * a copy-on-write mapping is not allowed.
+        */
+       if (!(vma->vm_flags & VM_SHARED)) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
+       if (vsize > psize) {
+               ret = -ENOSPC;
+               goto unlock;
+       }
+
+       memset(plr->kmem + off, 0, vsize);
+
+       if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+                           vsize, vma->vm_page_prot)) {
+               ret = -EAGAIN;
+               goto unlock;
+       }
+
+       vma->vm_ops = &pseudo_mmap_ops;
+       ret = 0;
+
+unlock:
+       mutex_unlock(&rdtgroup_mutex);
+       return ret;
+}
+
+/*
+ * File operations of the pseudo-lock character device. There are no
+ * read/write handlers (both stay NULL): the device is used through open,
+ * mmap and release only.
+ */
+static const struct file_operations pseudo_lock_dev_fops = {
+       .owner =        THIS_MODULE,
+       .open =         pseudo_lock_dev_open,
+       .mmap =         pseudo_lock_dev_mmap,
+       .release =      pseudo_lock_dev_release,
+};
+
+/*
+ * Register the "pseudo_lock" character device with a dynamically assigned
+ * major (stored in pseudo_lock_major) and register pseudo_lock_class. If
+ * the class cannot be registered the character device is unregistered
+ * again.
+ *
+ * Returns 0 on success, <0 on failure.
+ */
+int rdt_pseudo_lock_init(void)
+{
+       int major;
+       int err;
+
+       major = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+       if (major < 0)
+               return major;
+
+       pseudo_lock_major = major;
+
+       err = class_register(&pseudo_lock_class);
+       if (!err)
+               return 0;
+
+       unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+       return err;
+}
+
+/*
+ * Counterpart of rdt_pseudo_lock_init(): unregister pseudo_lock_class and
+ * the "pseudo_lock" character device, then reset pseudo_lock_major to 0.
+ */
+void rdt_pseudo_lock_release(void)
+{
+       class_unregister(&pseudo_lock_class);
+       unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+       pseudo_lock_major = 0;
+}
diff --git a/fs/resctrl/pseudo_lock_trace.h b/fs/resctrl/pseudo_lock_trace.h

deleted file mode 100644 (file)

index e69de29..0000000
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cc37f58b47dd7145cb6fdb55827291b63c23c622 100644 (file)
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -0,0 +1,4353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include "internal.h"
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+/* NOTE(review): presumably the kernfs root set up by rdtgroup_setup_root() - confirm. */
+static struct kernfs_root *rdt_root;
+
+/* The default resource group; CPUs dropped from other control groups go back to it. */
+struct rdtgroup rdtgroup_default;
+
+/* List of resource groups, linked through rdtgroup::rdtgroup_list. */
+LIST_HEAD(rdt_all_groups);
+
+/* list of entries for the schemata file */
+LIST_HEAD(resctrl_schema_all);
+
+/*
+ * List of struct mon_data containing private data of event files for use by
+ * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
+ */
+static LIST_HEAD(mon_data_kn_priv_list);
+
+/* The filesystem can only be mounted once. */
+bool resctrl_mounted;
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
+/*
+ * Status text of the last command: written through rdt_last_cmd_clear(),
+ * rdt_last_cmd_puts() and rdt_last_cmd_printf() with rdtgroup_mutex held and
+ * reported by rdt_last_cmd_status_show().
+ */
+static struct seq_buf last_cmd_status;
+
+/*
+ * Text printed by rdt_last_cmd_status_show().
+ * NOTE(review): presumably the storage backing last_cmd_status; the
+ * initialisation tying the two together is not visible here - confirm.
+ */
+static char last_cmd_status_buf[512];
+
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+
+static void rdtgroup_destroy_root(void);
+
+/* NOTE(review): presumably the resctrl debugfs directory - confirm against its users. */
+struct dentry *debugfs_resctrl;
+
+/*
+ * Memory bandwidth monitoring event to use for the default CTRL_MON group
+ * and each new CTRL_MON group created by the user.  Only relevant when
+ * the filesystem is mounted with the "mba_MBps" option so it does not
+ * matter that it remains uninitialized on systems that do not support
+ * the "mba_MBps" option.
+ */
+enum resctrl_event_id mba_mbps_default_event;
+
+/* NOTE(review): not referenced in this part of the file; presumably a debug mount option flag - confirm. */
+static bool resctrl_debug;
+
+/*
+ * Discard the text accumulated in last_cmd_status.
+ * Caller must hold rdtgroup_mutex.
+ */
+void rdt_last_cmd_clear(void)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+       seq_buf_clear(&last_cmd_status);
+}
+
+/*
+ * Append the string @s to last_cmd_status.
+ * Caller must hold rdtgroup_mutex.
+ */
+void rdt_last_cmd_puts(const char *s)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+       seq_buf_puts(&last_cmd_status, s);
+}
+
+/*
+ * Append printf-style formatted text to last_cmd_status.
+ * Caller must hold rdtgroup_mutex.
+ */
+void rdt_last_cmd_printf(const char *fmt, ...)
+{
+       va_list args;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       va_start(args, fmt);
+       seq_buf_vprintf(&last_cmd_status, fmt, args);
+       va_end(args);
+}
+
+/*
+ * Zero the staged configuration of every control domain of every
+ * allocation capable resource. Caller must hold rdtgroup_mutex.
+ */
+void rdt_staged_configs_clear(void)
+{
+       struct rdt_resource *res;
+       struct rdt_ctrl_domain *d;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       for_each_alloc_capable_rdt_resource(res) {
+               list_for_each_entry(d, &res->ctrl_domains, hdr.list) {
+                       memset(d->staged_config, 0,
+                              sizeof(d->staged_config));
+               }
+       }
+}
+
+/* True if the total or the local memory bandwidth monitoring event is enabled. */
+static bool resctrl_is_mbm_enabled(void)
+{
+       if (resctrl_arch_is_mbm_total_enabled())
+               return true;
+
+       return resctrl_arch_is_mbm_local_enabled();
+}
+
+/*
+ * True if event id @e lies in the inclusive range
+ * QOS_L3_MBM_TOTAL_EVENT_ID .. QOS_L3_MBM_LOCAL_EVENT_ID.
+ */
+static bool resctrl_is_mbm_event(int e)
+{
+       if (e < QOS_L3_MBM_TOTAL_EVENT_ID)
+               return false;
+
+       return e <= QOS_L3_MBM_LOCAL_EVENT_ID;
+}
+
+/*
+ * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
+ * of free CLOSIDs.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set current's closid to assign a task to a resource
+ *   group.
+ * + Context switch code can avoid extra memory references deciding which
+ *   CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ *   systems.
+ * - Our choices on how to configure each resource become progressively more
+ *   limited as the number of resources grows.
+ *
+ * A set bit in the map means the CLOSID is free, a clear bit means it is
+ * in use (see closid_alloc(), closid_free() and closid_allocated()).
+ */
+static unsigned long *closid_free_map;
+
+/* Number of bits in closid_free_map; set by closid_init(). */
+static int closid_free_map_len;
+
+/*
+ * Return the number of CLOSIDs tracked by the allocator, i.e. the length of
+ * closid_free_map as recorded by closid_init().
+ */
+int closids_supported(void)
+{
+       return closid_free_map_len;
+}
+
+/*
+ * Set up the CLOSID allocator.
+ *
+ * The bitmap is sized to the smallest num_closid of all schemata on
+ * resctrl_schema_all. Every CLOSID starts out free except
+ * RESCTRL_RESERVED_CLOSID. Nothing is allocated when there are no schemata.
+ *
+ * Return: 0 on success, -ENOMEM if the bitmap cannot be allocated
+ */
+static int closid_init(void)
+{
+       struct resctrl_schema *schema;
+       u32 nr_closid = ~0;
+
+       /* closid_init() is also called on monitor only platforms */
+       if (list_empty(&resctrl_schema_all))
+               return 0;
+
+       /* The usable number of CLOSIDs is the minimum across all schemata */
+       list_for_each_entry(schema, &resctrl_schema_all, list) {
+               if (schema->num_closid < nr_closid)
+                       nr_closid = schema->num_closid;
+       }
+
+       closid_free_map = bitmap_alloc(nr_closid, GFP_KERNEL);
+       if (!closid_free_map)
+               return -ENOMEM;
+
+       bitmap_fill(closid_free_map, nr_closid);
+       closid_free_map_len = nr_closid;
+
+       /* The default group always owns RESCTRL_RESERVED_CLOSID */
+       __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
+
+       return 0;
+}
+
+/* Free the CLOSID bitmap and clear the pointer to it. */
+static void closid_exit(void)
+{
+       bitmap_free(closid_free_map);
+       closid_free_map = NULL;
+}
+
+/*
+ * Allocate a CLOSID and mark it as in use in closid_free_map.
+ *
+ * With CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID and LLC occupancy monitoring
+ * enabled the CLOSID is chosen by resctrl_find_cleanest_closid(), otherwise
+ * the first free CLOSID in the bitmap is used.
+ * Caller must hold rdtgroup_mutex.
+ *
+ * Return: the allocated CLOSID, -ENOSPC if the bitmap has no free CLOSID, or
+ * the negative value returned by resctrl_find_cleanest_closid()
+ */
+static int closid_alloc(void)
+{
+       u32 closid;
+       int ret;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) ||
+           !resctrl_arch_is_llc_occupancy_enabled()) {
+               closid = find_first_bit(closid_free_map, closid_free_map_len);
+               if (closid == closid_free_map_len)
+                       return -ENOSPC;
+       } else {
+               ret = resctrl_find_cleanest_closid();
+               if (ret < 0)
+                       return ret;
+               closid = ret;
+       }
+       __clear_bit(closid, closid_free_map);
+
+       return closid;
+}
+
+/*
+ * Mark @closid as free again in closid_free_map.
+ * Caller must hold rdtgroup_mutex.
+ */
+void closid_free(int closid)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       __set_bit(closid, closid_free_map);
+}
+
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * Caller must hold rdtgroup_mutex.
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+bool closid_allocated(unsigned int closid)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       /* A set bit in closid_free_map means "free". */
+       return !test_bit(closid, closid_free_map);
+}
+
+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid of the resource group
+ *
+ * Each resource group is associated with a @closid. The mode is looked up
+ * by walking rdt_all_groups for the first group using that closid.
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid,
+ * RDT_NUM_MODES if no group on rdt_all_groups uses @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+       struct rdtgroup *grp;
+
+       list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
+               if (grp->closid != closid)
+                       continue;
+
+               return grp->mode;
+       }
+
+       return RDT_NUM_MODES;
+}
+
+/* Names of the resource group modes, indexed by &enum rdtgrp_mode. */
+static const char * const rdt_mode_str[] = {
+       [RDT_MODE_SHAREABLE]            = "shareable",
+       [RDT_MODE_EXCLUSIVE]            = "exclusive",
+       [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
+       [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgrp_mode
+ *
+ * Return: the rdt_mode_str[] entry for a @mode in the range
+ * RDT_MODE_SHAREABLE .. RDT_NUM_MODES - 1, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+       if (mode >= RDT_MODE_SHAREABLE && mode < RDT_NUM_MODES)
+               return rdt_mode_str[mode];
+
+       return "unknown";
+}
+
+/*
+ * Give @kn the filesystem uid and gid of the current task, i.e. of the
+ * creator of the rdtgroup directory or file. Nothing is changed when both
+ * ids are the global root ids.
+ *
+ * Return: 0 if no change is needed, else the result of kernfs_setattr()
+ */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+       struct iattr attr = {
+               .ia_valid       = ATTR_UID | ATTR_GID,
+               .ia_uid         = current_fsuid(),
+               .ia_gid         = current_fsgid(),
+       };
+       bool root_owned;
+
+       root_owned = uid_eq(attr.ia_uid, GLOBAL_ROOT_UID) &&
+                    gid_eq(attr.ia_gid, GLOBAL_ROOT_GID);
+       if (root_owned)
+               return 0;
+
+       return kernfs_setattr(kn, &attr);
+}
+
+/*
+ * Create the kernfs file described by @rft below @parent_kn and hand its
+ * ownership to the current task via rdtgroup_kn_set_ugid(). @rft becomes
+ * the private data of the new node. The node is removed again if the
+ * ownership cannot be set.
+ *
+ * Return: 0 on success, negative error code otherwise
+ */
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+       struct kernfs_node *file_kn;
+       int err;
+
+       file_kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+                                      GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+                                      0, rft->kf_ops, rft, NULL, NULL);
+       if (IS_ERR(file_kn))
+               return PTR_ERR(file_kn);
+
+       err = rdtgroup_kn_set_ugid(file_kn);
+       if (err)
+               kernfs_remove(file_kn);
+
+       return err;
+}
+
+/*
+ * kernfs seq_show handler: forward to the seq_show callback of the rftype
+ * stored as private data of the kernfs node. Files without such a callback
+ * show nothing and report success.
+ */
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+       struct kernfs_open_file *of = m->private;
+       struct rftype *rft = of->kn->priv;
+
+       if (!rft->seq_show)
+               return 0;
+
+       return rft->seq_show(of, m, arg);
+}
+
+/*
+ * kernfs write handler: forward to the write callback of the rftype stored
+ * as private data of the kernfs node.
+ *
+ * Return: result of the callback, -EINVAL if the file has no write callback
+ */
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+                                  size_t nbytes, loff_t off)
+{
+       struct rftype *rft = of->kn->priv;
+
+       if (!rft->write)
+               return -EINVAL;
+
+       return rft->write(of, buf, nbytes, off);
+}
+
+/*
+ * kernfs operations that dispatch reads and writes to the seq_show/write
+ * callbacks of the file's rftype, see rdtgroup_seqfile_show() and
+ * rdtgroup_file_write().
+ */
+static const struct kernfs_ops rdtgroup_kf_single_ops = {
+       .atomic_write_len       = PAGE_SIZE,
+       .write                  = rdtgroup_file_write,
+       .seq_show               = rdtgroup_seqfile_show,
+};
+
+/* kernfs operations shown through rdtgroup_mondata_show(); no write handler is set. */
+static const struct kernfs_ops kf_mondata_ops = {
+       .atomic_write_len       = PAGE_SIZE,
+       .seq_show               = rdtgroup_mondata_show,
+};
+
+/*
+ * True if the rftype behind the open file @of has RFTYPE_FLAGS_CPUS_LIST
+ * set. Callers use this to choose between the CPU list and the CPU bitmask
+ * text format.
+ */
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+       struct rftype *rft = of->kn->priv;
+
+       return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
+/*
+ * Show the CPUs of a resource group, either as a list or as a bitmask
+ * depending on is_cpu_list().
+ *
+ * For a pseudo-locked group the CPUs of the domain of its pseudo-locked
+ * region are shown instead of the group's own CPU mask.
+ *
+ * Return: 0 on success, -ENOENT if the group is gone, -ENODEV if the
+ * domain of a pseudo-locked region is not available
+ */
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+                             struct seq_file *s, void *v)
+{
+       struct rdtgroup *rdtgrp;
+       struct cpumask *mask;
+       int ret = 0;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       if (rdtgrp->mode != RDT_MODE_PSEUDO_LOCKED) {
+               mask = &rdtgrp->cpu_mask;
+       } else if (rdtgrp->plr->d) {
+               mask = &rdtgrp->plr->d->hdr.cpu_mask;
+       } else {
+               rdt_last_cmd_clear();
+               rdt_last_cmd_puts("Cache domain offline\n");
+               ret = -ENODEV;
+               goto out;
+       }
+
+       seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+                  cpumask_pr_args(mask));
+out:
+       rdtgroup_kn_unlock(of->kn);
+
+       return ret;
+}
+
+/*
+ * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
+ *
+ * Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL. When @r is set, its closid and rmid are handed to
+ * resctrl_arch_sync_cpu_closid_rmid() on each CPU; otherwise that function
+ * is called with a NULL argument.
+ */
+static void
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+{
+       struct resctrl_cpu_defaults defaults, *p = NULL;
+
+       if (r) {
+               defaults.closid = r->closid;
+               defaults.rmid = r->mon.rmid;
+               p = &defaults;
+       }
+
+       /* Final argument 1: wait until the call has completed on all CPUs. */
+       on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
+}
+
+/*
+ * Apply @newmask as the CPU mask of monitor group @rdtgrp.
+ *
+ * @newmask may only contain CPUs of the parent control group. CPUs dropped
+ * from @rdtgrp are added to the parent's mask; CPUs gained are cleared from
+ * the masks of the parent's other monitor groups. The affected CPUs are
+ * updated through update_closid_rmid(). @tmpmask is scratch space.
+ *
+ * Return: 0 on success, -EINVAL if @newmask has CPUs outside the parent group
+ */
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+                         cpumask_var_t tmpmask)
+{
+       struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+       struct list_head *head;
+
+       /* Check whether cpus belong to parent ctrl group */
+       cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+       if (!cpumask_empty(tmpmask)) {
+               rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
+               return -EINVAL;
+       }
+
+       /* Check whether cpus are dropped from this group */
+       cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+       if (!cpumask_empty(tmpmask)) {
+               /* Give any dropped cpus to parent rdtgroup */
+               cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+               update_closid_rmid(tmpmask, prgrp);
+       }
+
+       /*
+        * If we added cpus, remove them from previous group that owned them
+        * and update per-cpu rmid
+        */
+       cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+       if (!cpumask_empty(tmpmask)) {
+               head = &prgrp->mon.crdtgrp_list;
+               list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+                       if (crgrp == rdtgrp)
+                               continue;
+                       cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+                                      tmpmask);
+               }
+               update_closid_rmid(tmpmask, rdtgrp);
+       }
+
+       /* Done pushing/pulling - update this group with new mask */
+       cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+       return 0;
+}
+
+/*
+ * Remove the CPUs in @m from group @r, then restrict the CPU mask of each
+ * of @r's child monitor groups to the CPUs that @r still owns.
+ */
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+       struct rdtgroup *crgrp;
+
+       cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+       /* update the child mon group masks as well*/
+       list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+               cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+/*
+ * Apply @newmask as the CPU mask of control group @rdtgrp.
+ *
+ * CPUs dropped from @rdtgrp are handed to rdtgroup_default; dropping CPUs
+ * from the default group itself is rejected. CPUs gained are removed from
+ * every other group on rdt_all_groups and from those groups' child monitor
+ * groups. Finally the CPU masks of @rdtgrp's own child monitor groups are
+ * cleared. @tmpmask and @tmpmask1 are scratch space.
+ *
+ * Return: 0 on success, -EINVAL on an attempt to drop CPUs from the
+ * default group
+ */
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+                          cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+       struct rdtgroup *r, *crgrp;
+       struct list_head *head;
+
+       /* Check whether cpus are dropped from this group */
+       cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+       if (!cpumask_empty(tmpmask)) {
+               /* Can't drop from default group */
+               if (rdtgrp == &rdtgroup_default) {
+                       rdt_last_cmd_puts("Can't drop CPUs from default group\n");
+                       return -EINVAL;
+               }
+
+               /* Give any dropped cpus to rdtgroup_default */
+               cpumask_or(&rdtgroup_default.cpu_mask,
+                          &rdtgroup_default.cpu_mask, tmpmask);
+               update_closid_rmid(tmpmask, &rdtgroup_default);
+       }
+
+       /*
+        * If we added cpus, remove them from previous group and
+        * the prev group's child groups that owned them
+        * and update per-cpu closid/rmid.
+        */
+       cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+       if (!cpumask_empty(tmpmask)) {
+               list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+                       if (r == rdtgrp)
+                               continue;
+                       cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+                       if (!cpumask_empty(tmpmask1))
+                               cpumask_rdtgrp_clear(r, tmpmask1);
+               }
+               update_closid_rmid(tmpmask, rdtgrp);
+       }
+
+       /* Done pushing/pulling - update this group with new mask */
+       cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+       /*
+        * Clear child mon group masks since there is a new parent mask
+        * now and update the rmid for the cpus the child lost.
+        */
+       head = &rdtgrp->mon.crdtgrp_list;
+       list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+               cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+               update_closid_rmid(tmpmask, rdtgrp);
+               cpumask_clear(&crgrp->cpu_mask);
+       }
+
+       return 0;
+}
+
+/*
+ * Write handler for the CPU mask/list file of a resource group.
+ *
+ * @buf is parsed as a CPU list or as a CPU bitmask depending on
+ * is_cpu_list(). Only online CPUs are accepted. The new mask is applied
+ * through cpus_ctrl_write() for control groups and cpus_mon_write() for
+ * monitor groups. Groups that are pseudo-locked, or being set up for
+ * pseudo-locking, cannot have their CPUs changed.
+ *
+ * Return: @nbytes on success; -EINVAL, -ENOMEM, -ENOENT or the error of the
+ * parser or the helper on failure
+ */
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+                                  char *buf, size_t nbytes, loff_t off)
+{
+       cpumask_var_t tmpmask, newmask, tmpmask1;
+       struct rdtgroup *rdtgrp;
+       int ret;
+
+       if (!buf)
+               return -EINVAL;
+
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
+       if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+               free_cpumask_var(tmpmask);
+               return -ENOMEM;
+       }
+       if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+               free_cpumask_var(tmpmask);
+               free_cpumask_var(newmask);
+               return -ENOMEM;
+       }
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+
+       /*
+        * Start from an empty status so that text left behind by an earlier
+        * command is not reported as the outcome of this one.
+        */
+       rdt_last_cmd_clear();
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+           rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+               ret = -EINVAL;
+               rdt_last_cmd_puts("Pseudo-locking in progress\n");
+               goto unlock;
+       }
+
+       if (is_cpu_list(of))
+               ret = cpulist_parse(buf, newmask);
+       else
+               ret = cpumask_parse(buf, newmask);
+
+       if (ret) {
+               rdt_last_cmd_puts("Bad CPU list/mask\n");
+               goto unlock;
+       }
+
+       /* check that user didn't specify any offline cpus */
+       cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+       if (!cpumask_empty(tmpmask)) {
+               ret = -EINVAL;
+               rdt_last_cmd_puts("Can only assign online CPUs\n");
+               goto unlock;
+       }
+
+       if (rdtgrp->type == RDTCTRL_GROUP)
+               ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+       else if (rdtgrp->type == RDTMON_GROUP)
+               ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+       else
+               ret = -EINVAL;
+
+unlock:
+       rdtgroup_kn_unlock(of->kn);
+       free_cpumask_var(tmpmask);
+       free_cpumask_var(newmask);
+       free_cpumask_var(tmpmask1);
+
+       /* A zero result means success: report the whole buffer as consumed. */
+       return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ * @rdtgrp must not be used after this call.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
+{
+       kernfs_put(rdtgrp->kn);
+       kfree(rdtgrp);
+}
+
+/*
+ * Callback run on the CPU @task was seen on, see update_task_closid_rmid().
+ * @task is passed as void * to match the cross-CPU call interface.
+ */
+static void _update_task_closid_rmid(void *task)
+{
+       /*
+        * If the task is still current on this CPU, update PQR_ASSOC MSR.
+        * Otherwise, the MSR is updated when the task is scheduled in.
+        */
+       if (task == current)
+               resctrl_arch_sched_in(task);
+}
+
+/*
+ * Make the closid/rmid of task @t take effect if it is running right now.
+ *
+ * On SMP kernels, when @t is current on some CPU, run
+ * _update_task_closid_rmid() on that CPU and wait for it to finish.
+ * Otherwise call it directly on this CPU.
+ */
+static void update_task_closid_rmid(struct task_struct *t)
+{
+       if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+               smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+       else
+               _update_task_closid_rmid(t);
+}
+
+/*
+ * Test whether @tsk already belongs to @rdtgrp.
+ *
+ * The closid to compare against is the group's own for a control group and
+ * the parent's for a monitor group. The task belongs to the group when both
+ * that closid and the group's rmid match.
+ *
+ * Return: true if @tsk is in @rdtgrp, false otherwise (also for any other
+ * group type)
+ */
+static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
+{
+       u32 closid, rmid;
+
+       switch (rdtgrp->type) {
+       case RDTCTRL_GROUP:
+               closid = rdtgrp->closid;
+               break;
+       case RDTMON_GROUP:
+               closid = rdtgrp->mon.parent->closid;
+               break;
+       default:
+               return false;
+       }
+
+       if (!resctrl_arch_match_closid(tsk, closid))
+               return false;
+
+       rmid = rdtgrp->mon.rmid;
+
+       return resctrl_arch_match_rmid(tsk, closid, rmid);
+}
+
+/*
+ * Move @tsk into @rdtgrp: set the task's closid/rmid and, if the task is
+ * currently running, have the CPU it runs on pick up the new values.
+ *
+ * Return: 0 on success or if @tsk already is in @rdtgrp, -EINVAL if
+ * @rdtgrp is a monitor group whose parent's closid does not match the
+ * closid of @tsk
+ */
+static int __rdtgroup_move_task(struct task_struct *tsk,
+                               struct rdtgroup *rdtgrp)
+{
+       /* If the task is already in rdtgrp, no need to move the task. */
+       if (task_in_rdtgroup(tsk, rdtgrp))
+               return 0;
+
+       /*
+        * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+        * updated by them.
+        *
+        * For ctrl_mon groups, move both closid and rmid.
+        * For monitor groups, can move the tasks only from
+        * their parent CTRL group.
+        */
+       if (rdtgrp->type == RDTMON_GROUP &&
+           !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+               rdt_last_cmd_puts("Can't move task to different control group\n");
+               return -EINVAL;
+       }
+
+       if (rdtgrp->type == RDTMON_GROUP)
+               resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
+                                            rdtgrp->mon.rmid);
+       else
+               resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
+                                            rdtgrp->mon.rmid);
+
+       /*
+        * Ensure the task's closid and rmid are written before determining if
+        * the task is current that will decide if it will be interrupted.
+        * This pairs with the full barrier between the rq->curr update and
+        * resctrl_arch_sched_in() during context switch.
+        */
+       smp_mb();
+
+       /*
+        * By now, the task's closid and rmid are set. If the task is current
+        * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+        * group go into effect. If the task is not current, the MSR will be
+        * updated when the task is scheduled in.
+        */
+       update_task_closid_rmid(tsk);
+
+       return 0;
+}
+
+/*
+ * True if allocation is supported, @r is a control group and the closid of
+ * task @t matches the closid of @r.
+ */
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+       if (!resctrl_arch_alloc_capable())
+               return false;
+
+       if (r->type != RDTCTRL_GROUP)
+               return false;
+
+       return resctrl_arch_match_closid(t, r->closid);
+}
+
+/*
+ * True if monitoring is supported, @r is a monitor group and task @t
+ * matches the rmid of @r (checked together with the closid of @r's parent).
+ */
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+       if (!resctrl_arch_mon_capable())
+               return false;
+
+       if (r->type != RDTMON_GROUP)
+               return false;
+
+       return resctrl_arch_match_rmid(t, r->mon.parent->closid, r->mon.rmid);
+}
+
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * Walks all threads of all processes under the RCU read lock and stops at
+ * the first one whose closid (control group) or rmid (monitor group)
+ * matches @r. Caller must hold rdtgroup_mutex.
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+       struct task_struct *p, *t;
+       int ret = 0;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       rcu_read_lock();
+       for_each_process_thread(p, t) {
+               if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+                       ret = 1;
+                       /*
+                        * for_each_process_thread() is a double loop: a
+                        * break would only leave the inner thread loop and
+                        * the scan of the remaining processes would go on.
+                        */
+                       goto unlock;
+               }
+       }
+unlock:
+       rcu_read_unlock();
+
+       return ret;
+}
+
+/*
+ * Check whether the current task may move @task to another resource group.
+ *
+ * The move is allowed when the caller's effective uid is the global root
+ * uid or equals the real or the saved uid of @task. @of is not used.
+ *
+ * Return: 0 if permitted, -EPERM otherwise
+ */
+static int rdtgroup_task_write_permission(struct task_struct *task,
+                                         struct kernfs_open_file *of)
+{
+       const struct cred *tcred = get_task_cred(task);
+       const struct cred *cred = current_cred();
+       bool permitted;
+       int ret = 0;
+
+       /*
+        * One check is enough, even when all tasks of a thread group are
+        * being attached.
+        */
+       permitted = uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+                   uid_eq(cred->euid, tcred->uid) ||
+                   uid_eq(cred->euid, tcred->suid);
+       if (!permitted) {
+               rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
+               ret = -EPERM;
+       }
+
+       put_cred(tcred);
+       return ret;
+}
+
+/*
+ * Move the task identified by @pid to @rdtgrp; a @pid of 0 selects the
+ * current task.
+ *
+ * The task is looked up under the RCU read lock and a reference is taken
+ * before the lock is dropped, so the task stays valid for the permission
+ * check and the move.
+ *
+ * Return: 0 on success, -ESRCH if no task with @pid exists, otherwise the
+ * error of rdtgroup_task_write_permission() or __rdtgroup_move_task()
+ */
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+                             struct kernfs_open_file *of)
+{
+       struct task_struct *tsk;
+       int ret;
+
+       rcu_read_lock();
+       if (pid) {
+               tsk = find_task_by_vpid(pid);
+               if (!tsk) {
+                       rcu_read_unlock();
+                       rdt_last_cmd_printf("No task %d\n", pid);
+                       return -ESRCH;
+               }
+       } else {
+               tsk = current;
+       }
+
+       get_task_struct(tsk);
+       rcu_read_unlock();
+
+       ret = rdtgroup_task_write_permission(tsk, of);
+       if (!ret)
+               ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+       put_task_struct(tsk);
+       return ret;
+}
+
+/*
+ * Write handler that moves tasks into a resource group.
+ *
+ * @buf holds a comma separated list of pids. Each pid is moved with
+ * rdtgroup_move_task(); processing stops at the first pid that fails to
+ * parse, is negative or cannot be moved. Tasks moved before the failing
+ * entry stay moved. Groups that are pseudo-locked, or being set up for
+ * pseudo-locking, do not accept tasks.
+ *
+ * Return: @nbytes on success, -ENOENT if the group is gone, -EINVAL or the
+ * error of rdtgroup_move_task() otherwise
+ */
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes, loff_t off)
+{
+       struct rdtgroup *rdtgrp;
+       char *pid_str;
+       int ret = 0;
+       pid_t pid;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               rdtgroup_kn_unlock(of->kn);
+               return -ENOENT;
+       }
+       rdt_last_cmd_clear();
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+           rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+               ret = -EINVAL;
+               rdt_last_cmd_puts("Pseudo-locking in progress\n");
+               goto unlock;
+       }
+
+       /* strsep() sets buf to NULL once the last entry has been consumed. */
+       while (buf && buf[0] != '\0' && buf[0] != '\n') {
+               pid_str = strim(strsep(&buf, ","));
+
+               if (kstrtoint(pid_str, 0, &pid)) {
+                       rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
+                       ret = -EINVAL;
+                       break;
+               }
+
+               if (pid < 0) {
+                       rdt_last_cmd_printf("Invalid pid %d\n", pid);
+                       ret = -EINVAL;
+                       break;
+               }
+
+               ret = rdtgroup_move_task(pid, rdtgrp, of);
+               if (ret) {
+                       rdt_last_cmd_printf("Error while processing task %d\n", pid);
+                       break;
+               }
+       }
+
+unlock:
+       rdtgroup_kn_unlock(of->kn);
+
+       return ret ?: nbytes;
+}
+
+/*
+ * Print, one per line, the pid of every thread that belongs to group @r.
+ * Pids are printed as returned by task_pid_vnr(); threads for which it
+ * returns 0 are skipped.
+ */
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+       struct task_struct *proc, *thread;
+       pid_t pid;
+
+       rcu_read_lock();
+       for_each_process_thread(proc, thread) {
+               if (!is_closid_match(thread, r) && !is_rmid_match(thread, r))
+                       continue;
+
+               pid = task_pid_vnr(thread);
+               if (!pid)
+                       continue;
+
+               seq_printf(s, "%d\n", pid);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Show the tasks of the resource group behind @of.
+ *
+ * Return: 0 on success, -ENOENT if the group is gone
+ */
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+                              struct seq_file *s, void *v)
+{
+       struct rdtgroup *grp;
+       int err = 0;
+
+       grp = rdtgroup_kn_lock_live(of->kn);
+       if (!grp)
+               err = -ENOENT;
+       else
+               show_rdt_tasks(grp, s);
+       rdtgroup_kn_unlock(of->kn);
+
+       return err;
+}
+
+/*
+ * Show the closid of the resource group behind @of.
+ *
+ * Return: 0 on success, -ENOENT if the group is gone
+ */
+static int rdtgroup_closid_show(struct kernfs_open_file *of,
+                               struct seq_file *s, void *v)
+{
+       struct rdtgroup *grp;
+       int err = 0;
+
+       grp = rdtgroup_kn_lock_live(of->kn);
+       if (!grp)
+               err = -ENOENT;
+       else
+               seq_printf(s, "%u\n", grp->closid);
+       rdtgroup_kn_unlock(of->kn);
+
+       return err;
+}
+
+/*
+ * Show the rmid of the resource group behind @of.
+ *
+ * Return: 0 on success, -ENOENT if the group is gone
+ */
+static int rdtgroup_rmid_show(struct kernfs_open_file *of,
+                             struct seq_file *s, void *v)
+{
+       struct rdtgroup *grp;
+       int err = 0;
+
+       grp = rdtgroup_kn_lock_live(of->kn);
+       if (!grp)
+               err = -ENOENT;
+       else
+               seq_printf(s, "%u\n", grp->mon.rmid);
+       rdtgroup_kn_unlock(of->kn);
+
+       return err;
+}
+
+#ifdef CONFIG_PROC_CPU_RESCTRL
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1)   res:
+ *      mon:
+ *
+ *    resctrl is not available.
+ *
+ * 2)   res:/
+ *      mon:
+ *
+ *    Task is part of the root resctrl control group, and it is not associated
+ *    to any monitor group.
+ *
+ * 3)  res:/
+ *     mon:mon0
+ *
+ *    Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4)  res:group0
+ *     mon:
+ *
+ *    Task is part of resctrl control group group0, and it is not associated
+ *    to any monitor group.
+ *
+ * 5) res:group0
+ *    mon:mon1
+ *
+ *    Task is part of resctrl control group group0 and monitor group mon1.
+ *
+ * The groups are searched with rdtgroup_mutex held. @ns and @pid are not
+ * used here.
+ *
+ * Return: 0 on success, -ENOENT if resctrl is mounted but no shareable or
+ * exclusive group matches the closid of @tsk.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+                     struct pid *pid, struct task_struct *tsk)
+{
+       struct rdtgroup *rdtg;
+       int ret = 0;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       /* Return empty if resctrl has not been mounted. */
+       if (!resctrl_mounted) {
+               seq_puts(s, "res:\nmon:\n");
+               goto unlock;
+       }
+
+       list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+               struct rdtgroup *crg;
+
+               /*
+                * Task information is only relevant for shareable
+                * and exclusive groups.
+                */
+               if (rdtg->mode != RDT_MODE_SHAREABLE &&
+                   rdtg->mode != RDT_MODE_EXCLUSIVE)
+                       continue;
+
+               if (!resctrl_arch_match_closid(tsk, rdtg->closid))
+                       continue;
+
+               seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+                          rdt_kn_name(rdtg->kn));
+               seq_puts(s, "mon:");
+               /* Print the first child monitor group matching the task, if any. */
+               list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+                                   mon.crdtgrp_list) {
+                       if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
+                                                    crg->mon.rmid))
+                               continue;
+                       seq_printf(s, "%s", rdt_kn_name(crg->kn));
+                       break;
+               }
+               seq_putc(s, '\n');
+               goto unlock;
+       }
+       /*
+        * The above search should succeed. Otherwise return
+        * with an error.
+        */
+       ret = -ENOENT;
+unlock:
+       mutex_unlock(&rdtgroup_mutex);
+
+       return ret;
+}
+#endif
+
+/*
+ * Show the status text of the last command, or "ok" when no text has been
+ * recorded. The status is read with rdtgroup_mutex held.
+ *
+ * Return: always 0
+ */
+static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
+                                   struct seq_file *seq, void *v)
+{
+       int used;
+
+       mutex_lock(&rdtgroup_mutex);
+       used = seq_buf_used(&last_cmd_status);
+       if (!used)
+               seq_puts(seq, "ok\n");
+       else
+               seq_printf(seq, "%.*s", used, last_cmd_status_buf);
+       mutex_unlock(&rdtgroup_mutex);
+
+       return 0;
+}
+
+/*
+ * Return the private data pointer of the parent of kernfs node @kn.
+ * The parent is dereferenced inside an RCU read-side section that ends when
+ * the function returns.
+ */
+static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+{
+       /*
+        * The parent pointer is only valid within RCU section since it can be
+        * replaced.
+        */
+       guard(rcu)();
+       return rcu_dereference(kn->__parent)->priv;
+}
+
+/*
+ * Show num_closid of the schema stored as private data of the parent
+ * directory of the file. Always returns 0.
+ */
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+                               struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", s->num_closid);
+       return 0;
+}
+
+/*
+ * Show, in hex, the default control value of the resource of the schema
+ * stored as private data of the parent directory of the file.
+ * Always returns 0.
+ */
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+                                struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%x\n", resctrl_get_default_ctrl(schema->res));
+
+       return 0;
+}
+
+/*
+ * Show cache.min_cbm_bits of the resource of the schema stored as private
+ * data of the parent directory of the file. Always returns 0.
+ */
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+                                struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", schema->res->cache.min_cbm_bits);
+
+       return 0;
+}
+
+/*
+ * Show, in hex, cache.shareable_bits of the resource of the schema stored
+ * as private data of the parent directory of the file. Always returns 0.
+ */
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+                                  struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%x\n", schema->res->cache.shareable_bits);
+
+       return 0;
+}
+
+/*
+ * rdt_bit_usage_show - Display current usage of resources
+ *
+ * A domain is a shared resource that can now be allocated differently. Here
+ * we display the current regions of the domain as an annotated bitmask.
+ * For each domain of this resource its allocation bitmask
+ * is annotated as below to indicate the current usage of the corresponding bit:
+ *   0 - currently unused
+ *   X - currently available for sharing and used by software and hardware
+ *   H - currently used by hardware only but available for software use
+ *   S - currently used and shareable by software only
+ *   E - currently used exclusively by one resource group
+ *   P - currently pseudo-locked by one resource group
+ *
+ * Domains are emitted as "<id>=<annotation>" separated by ';', with the
+ * highest bit of the bitmask printed first, followed by a single newline.
+ *
+ * Return: always 0.
+ */
+static int rdt_bit_usage_show(struct kernfs_open_file *of,
+                             struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+       /*
+        * Use unsigned long even though only 32 bits are used to ensure
+        * test_bit() is used safely.
+        */
+       unsigned long sw_shareable = 0, hw_shareable = 0;
+       unsigned long exclusive = 0, pseudo_locked = 0;
+       struct rdt_resource *r = s->res;
+       struct rdt_ctrl_domain *dom;
+       int i, hwb, swb, excl, psl;
+       enum rdtgrp_mode mode;
+       bool sep = false;
+       u32 ctrl_val;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+       hw_shareable = r->cache.shareable_bits;
+       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+               if (sep)
+                       seq_putc(seq, ';');
+               sw_shareable = 0;
+               exclusive = 0;
+               seq_printf(seq, "%d=", dom->hdr.id);
+
+               /* Accumulate the bits used by every allocated CLOSID. */
+               for (i = 0; i < closids_supported(); i++) {
+                       if (!closid_allocated(i))
+                               continue;
+                       ctrl_val = resctrl_arch_get_config(r, dom, i,
+                                                          s->conf_type);
+                       mode = rdtgroup_mode_by_closid(i);
+                       switch (mode) {
+                       case RDT_MODE_SHAREABLE:
+                               sw_shareable |= ctrl_val;
+                               break;
+                       case RDT_MODE_EXCLUSIVE:
+                               exclusive |= ctrl_val;
+                               break;
+                       case RDT_MODE_PSEUDO_LOCKSETUP:
+                       /*
+                        * RDT_MODE_PSEUDO_LOCKSETUP is possible
+                        * here but not included since the CBM
+                        * associated with this CLOSID in this mode
+                        * is not initialized and no task or cpu can be
+                        * assigned this CLOSID.
+                        */
+                               break;
+                       case RDT_MODE_PSEUDO_LOCKED:
+                       case RDT_NUM_MODES:
+                               WARN(1,
+                                    "invalid mode for closid %d\n", i);
+                               break;
+                       }
+               }
+
+               /* The pseudo-locked CBM depends on the domain only, not the bit. */
+               pseudo_locked = dom->plr ? dom->plr->cbm : 0;
+
+               for (i = r->cache.cbm_len - 1; i >= 0; i--) {
+                       hwb = test_bit(i, &hw_shareable);
+                       swb = test_bit(i, &sw_shareable);
+                       excl = test_bit(i, &exclusive);
+                       psl = test_bit(i, &pseudo_locked);
+                       if (hwb && swb)
+                               seq_putc(seq, 'X');
+                       else if (hwb)
+                               seq_putc(seq, 'H');
+                       else if (swb)
+                               seq_putc(seq, 'S');
+                       else if (excl)
+                               seq_putc(seq, 'E');
+                       else if (psl)
+                               seq_putc(seq, 'P');
+                       else /* Unused bits remain */
+                               seq_putc(seq, '0');
+               }
+               sep = true;
+       }
+       seq_putc(seq, '\n');
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+       return 0;
+}
+
+/* Print membw.min_bw of the schema's resource. */
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+                          struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", schema->res->membw.min_bw);
+
+       return 0;
+}
+
+/* Print num_rmid of the resource attached to the parent node. */
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+                             struct seq_file *seq, void *v)
+{
+       struct rdt_resource *res;
+
+       res = rdt_kn_parent_priv(of->kn);
+       seq_printf(seq, "%d\n", res->num_rmid);
+       return 0;
+}
+
+/*
+ * List the name of every event on the resource's evt_list, one per line.
+ * A configurable event is followed by an extra "<name>_config" line.
+ */
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+                                struct seq_file *seq, void *v)
+{
+       struct rdt_resource *res = rdt_kn_parent_priv(of->kn);
+       struct mon_evt *evt;
+
+       list_for_each_entry(evt, &res->evt_list, list) {
+               seq_printf(seq, "%s\n", evt->name);
+               if (!evt->configurable)
+                       continue;
+               seq_printf(seq, "%s_config\n", evt->name);
+       }
+
+       return 0;
+}
+
+/* Print membw.bw_gran of the schema's resource. */
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
+                           struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", schema->res->membw.bw_gran);
+
+       return 0;
+}
+
+/* Print membw.delay_linear of the schema's resource. */
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+                                struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", schema->res->membw.delay_linear);
+
+       return 0;
+}
+
+/* Print the current value of resctrl_rmid_realloc_threshold. */
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+                                 struct seq_file *seq, void *v)
+{
+       unsigned int threshold = resctrl_rmid_realloc_threshold;
+
+       seq_printf(seq, "%u\n", threshold);
+       return 0;
+}
+
+/*
+ * Print the textual name of the resource's membw.throttle_mode. A value
+ * that matches none of the known modes triggers a one-time warning and
+ * prints nothing.
+ */
+static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
+                                        struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+       const char *mode_str = NULL;
+
+       switch (schema->res->membw.throttle_mode) {
+       case THREAD_THROTTLE_PER_THREAD:
+               mode_str = "per-thread\n";
+               break;
+       case THREAD_THROTTLE_MAX:
+               mode_str = "max\n";
+               break;
+       case THREAD_THROTTLE_UNDEFINED:
+               mode_str = "undefined\n";
+               break;
+       }
+
+       if (!mode_str) {
+               WARN_ON_ONCE(1);
+               return 0;
+       }
+
+       seq_puts(seq, mode_str);
+       return 0;
+}
+
+/*
+ * Parse an unsigned integer from @buf and store its rounded value in
+ * resctrl_rmid_realloc_threshold. Values above resctrl_rmid_realloc_limit
+ * are rejected with -EINVAL.
+ */
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+                                      char *buf, size_t nbytes, loff_t off)
+{
+       unsigned int threshold;
+       int err;
+
+       err = kstrtouint(buf, 0, &threshold);
+       if (err)
+               return err;
+       if (threshold > resctrl_rmid_realloc_limit)
+               return -EINVAL;
+
+       resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
+       return nbytes;
+}
+
+/*
+ * rdtgroup_mode_show - Display mode of this resource group
+ *
+ * Returns -ENOENT when rdtgroup_kn_lock_live() yields no group.
+ */
+static int rdtgroup_mode_show(struct kernfs_open_file *of,
+                             struct seq_file *s, void *v)
+{
+       struct rdtgroup *rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       int ret = 0;
+
+       if (rdtgrp)
+               seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
+       else
+               ret = -ENOENT;
+
+       /* The unlock is needed whether or not a group was found. */
+       rdtgroup_kn_unlock(of->kn);
+       return ret;
+}
+
+/*
+ * Map a CDP configuration type to its counterpart: CODE <-> DATA.
+ * Anything else maps to CDP_NONE.
+ */
+static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
+{
+       if (my_type == CDP_CODE)
+               return CDP_DATA;
+       if (my_type == CDP_DATA)
+               return CDP_CODE;
+
+       return CDP_NONE;
+}
+
+/* Print cache.arch_has_sparse_bitmasks of the schema's resource. */
+static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
+                                       struct seq_file *seq, void *v)
+{
+       struct resctrl_schema *schema = rdt_kn_parent_priv(of->kn);
+
+       seq_printf(seq, "%u\n", schema->res->cache.arch_has_sparse_bitmasks);
+       return 0;
+}
+
+/**
+ * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ *                           use of the domain
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @type: CDP type of @r.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Tests whether @cbm, intended for @closid on domain @d, shares bits with
+ * the configuration of any other allocated closid or with the bits used
+ * by hardware. With @exclusive set, only resource groups in exclusive mode
+ * are considered and the hardware bits are ignored. Without it, any
+ * resource group and the hardware bits are considered. Closids whose group
+ * is in pseudo-locksetup mode are never considered.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: false if CBM does not overlap, true if it does.
+ */
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+                                   unsigned long cbm, int closid,
+                                   enum resctrl_conf_type type, bool exclusive)
+{
+       unsigned long other;
+       enum rdtgrp_mode mode;
+       int id;
+
+       /* Check for any overlap with regions used by hardware directly */
+       if (!exclusive) {
+               other = r->cache.shareable_bits;
+               if (bitmap_intersects(&cbm, &other, r->cache.cbm_len))
+                       return true;
+       }
+
+       /* Check for overlap with other resource groups */
+       for (id = 0; id < closids_supported(); id++) {
+               other = resctrl_arch_get_config(r, d, id, type);
+               mode = rdtgroup_mode_by_closid(id);
+
+               if (!closid_allocated(id) || id == closid ||
+                   mode == RDT_MODE_PSEUDO_LOCKSETUP)
+                       continue;
+
+               if (!bitmap_intersects(&cbm, &other, r->cache.cbm_len))
+                       continue;
+
+               /* An exclusive-only test ignores non-exclusive groups. */
+               if (!exclusive || mode == RDT_MODE_EXCLUSIVE)
+                       return true;
+       }
+
+       return false;
+}
+
+/**
+ * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
+ * @s: Schema for the resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Resources that can be allocated using a CBM can use the CBM to control
+ * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
+ * for overlap. The test is not limited to the specific resource for which
+ * the CBM is intended though - when dealing with CDP resources that share
+ * the underlying hardware, the overlap check is also performed on the CDP
+ * resource sharing the hardware.
+ *
+ * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
+ * overlap test.
+ *
+ * Return: true if CBM overlap detected, false if there is no overlap
+ */
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
+                          unsigned long cbm, int closid, bool exclusive)
+{
+       struct rdt_resource *res = s->res;
+       enum resctrl_conf_type own_type = s->conf_type;
+
+       if (__rdtgroup_cbm_overlaps(res, d, cbm, closid, own_type, exclusive))
+               return true;
+
+       /* The peer type only needs checking while CDP is enabled. */
+       if (!resctrl_arch_get_cdp_enabled(res->rid))
+               return false;
+
+       return __rdtgroup_cbm_overlaps(res, d, cbm, closid,
+                                      resctrl_peer_type(own_type), exclusive);
+}
+
+/**
+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: Resource group identified through its closid.
+ *
+ * An exclusive resource group implies that there should be no sharing of
+ * its allocated resources. At the time this group is considered to be
+ * exclusive, this test determines whether its current schemata supports
+ * the setting by testing for overlap with all other resource groups.
+ * MBA and SMBA resources are skipped. If no other resource exists, the
+ * group cannot be exclusive.
+ *
+ * Return: true if resource group can be exclusive, false if there is overlap
+ * with allocations of other resource groups and thus this resource group
+ * cannot be exclusive.
+ */
+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
+{
+       struct resctrl_schema *schema;
+       struct rdt_ctrl_domain *dom;
+       int closid = rdtgrp->closid;
+       struct rdt_resource *res;
+       bool cache_seen = false;
+       u32 cbm;
+
+       /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       list_for_each_entry(schema, &resctrl_schema_all, list) {
+               res = schema->res;
+               if (res->rid == RDT_RESOURCE_MBA ||
+                   res->rid == RDT_RESOURCE_SMBA)
+                       continue;
+
+               cache_seen = true;
+               list_for_each_entry(dom, &res->ctrl_domains, hdr.list) {
+                       cbm = resctrl_arch_get_config(res, dom, closid,
+                                                     schema->conf_type);
+                       if (!rdtgroup_cbm_overlaps(schema, dom, cbm, closid,
+                                                  false))
+                               continue;
+
+                       rdt_last_cmd_puts("Schemata overlaps\n");
+                       return false;
+               }
+       }
+
+       if (cache_seen)
+               return true;
+
+       rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
+       return false;
+}
+
+/*
+ * rdtgroup_mode_write - Modify the resource group's mode
+ *
+ * The input must end in a newline, which is stripped before the mode name
+ * is compared. Accepted names are "shareable", "exclusive" and, only when
+ * CONFIG_RESCTRL_FS_PSEUDO_LOCK is enabled, "pseudo-locksetup". Writing the
+ * name of the mode the group is already in succeeds without any change. A
+ * group in pseudo-locked mode cannot be moved to another mode.
+ *
+ * Returns @nbytes on success. Returns -EINVAL for a missing trailing
+ * newline, an unknown or unsupported mode, a pseudo-locked group, or a
+ * failed exclusivity test; -ENOENT when rdtgroup_kn_lock_live() yields no
+ * group; otherwise the error reported by rdtgroup_locksetup_enter() or
+ * rdtgroup_locksetup_exit().
+ */
+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
+                                  char *buf, size_t nbytes, loff_t off)
+{
+       struct rdtgroup *rdtgrp;
+       enum rdtgrp_mode mode;
+       int ret = 0;
+
+       /* Valid input requires a trailing newline */
+       if (nbytes == 0 || buf[nbytes - 1] != '\n')
+               return -EINVAL;
+       buf[nbytes - 1] = '\0';
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               rdtgroup_kn_unlock(of->kn);
+               return -ENOENT;
+       }
+
+       rdt_last_cmd_clear();
+
+       mode = rdtgrp->mode;
+
+       /* Requesting the mode the group is already in is a successful no-op. */
+       if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
+           (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
+           (!strcmp(buf, "pseudo-locksetup") &&
+            mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
+           (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
+               goto out;
+
+       if (mode == RDT_MODE_PSEUDO_LOCKED) {
+               rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (!strcmp(buf, "shareable")) {
+               /* Leaving pseudo-locksetup requires its exit handling first. */
+               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+                       ret = rdtgroup_locksetup_exit(rdtgrp);
+                       if (ret)
+                               goto out;
+               }
+               rdtgrp->mode = RDT_MODE_SHAREABLE;
+       } else if (!strcmp(buf, "exclusive")) {
+               /* The exclusivity test runs before pseudo-locksetup is left. */
+               if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+                       ret = rdtgroup_locksetup_exit(rdtgrp);
+                       if (ret)
+                               goto out;
+               }
+               rdtgrp->mode = RDT_MODE_EXCLUSIVE;
+       } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
+                  !strcmp(buf, "pseudo-locksetup")) {
+               ret = rdtgroup_locksetup_enter(rdtgrp);
+               if (ret)
+                       goto out;
+               rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
+       } else {
+               rdt_last_cmd_puts("Unknown or unsupported mode\n");
+               ret = -EINVAL;
+       }
+
+out:
+       rdtgroup_kn_unlock(of->kn);
+       /* A non-zero error code wins; otherwise report the bytes consumed. */
+       return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
+ * @r: RDT resource to which @d belongs.
+ * @d: RDT domain instance.
+ * @cbm: bitmask for which the size should be computed.
+ *
+ * Translates the bitmask associated with RDT domain instance @d into the
+ * number of bytes it represents. The total cache size is first divided by
+ * the CBM length to obtain the bytes covered by one bit; that value is
+ * then multiplied by the number of bits set in the bitmask.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used to make the
+ * bitmap functions work correctly.
+ *
+ * Return: the size in bytes, or 0 when @r is not scoped to an L2 or L3
+ * cache or no cache information is found.
+ */
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
+                                 struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+       struct cacheinfo *ci;
+       int bits_set;
+
+       if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
+               return 0;
+
+       bits_set = bitmap_weight(&cbm, r->cache.cbm_len);
+       ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
+       if (!ci)
+               return 0;
+
+       return ci->size / r->cache.cbm_len * bits_set;
+}
+
+/*
+ * is_mba_sc - Report membw.mba_sc for the MBA resource
+ *
+ * A NULL @r selects the MBA resource. Any resource other than MBA yields
+ * false.
+ */
+bool is_mba_sc(struct rdt_resource *r)
+{
+       struct rdt_resource *res = r;
+
+       if (!res)
+               res = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+
+       /*
+        * The software controller support is only applicable to MBA resource.
+        * Make sure to check for resource type.
+        */
+       return res->rid == RDT_RESOURCE_MBA && res->membw.mba_sc;
+}
+
+/*
+ * rdtgroup_size_show - Display size in bytes of allocated regions
+ *
+ * The "size" file mirrors the layout of the "schemata" file, printing the
+ * size in bytes of each region instead of the capacity bitmask.
+ *
+ * One line is printed per schema: the schema name padded to max_name_width,
+ * a ':' and then "<domain id>=<size>" entries separated by ';'. A
+ * pseudo-locked group prints a single line for its pseudo-locked region
+ * instead.
+ *
+ * Returns 0 on success, -ENOENT when rdtgroup_kn_lock_live() yields no
+ * group and -ENODEV when the pseudo-locked region has no domain.
+ */
+static int rdtgroup_size_show(struct kernfs_open_file *of,
+                             struct seq_file *s, void *v)
+{
+       struct resctrl_schema *schema;
+       enum resctrl_conf_type type;
+       struct rdt_ctrl_domain *d;
+       struct rdtgroup *rdtgrp;
+       struct rdt_resource *r;
+       unsigned int size;
+       int ret = 0;
+       u32 closid;
+       bool sep;
+       u32 ctrl;
+
+       rdtgrp = rdtgroup_kn_lock_live(of->kn);
+       if (!rdtgrp) {
+               rdtgroup_kn_unlock(of->kn);
+               return -ENOENT;
+       }
+
+       /* A pseudo-locked group reports only the region described by its plr. */
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+               if (!rdtgrp->plr->d) {
+                       rdt_last_cmd_clear();
+                       rdt_last_cmd_puts("Cache domain offline\n");
+                       ret = -ENODEV;
+               } else {
+                       seq_printf(s, "%*s:", max_name_width,
+                                  rdtgrp->plr->s->name);
+                       size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
+                                                   rdtgrp->plr->d,
+                                                   rdtgrp->plr->cbm);
+                       seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
+               }
+               goto out;
+       }
+
+       closid = rdtgrp->closid;
+
+       list_for_each_entry(schema, &resctrl_schema_all, list) {
+               r = schema->res;
+               type = schema->conf_type;
+               sep = false;
+               seq_printf(s, "%*s:", max_name_width, schema->name);
+               list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+                       if (sep)
+                               seq_putc(s, ';');
+                       /* Every domain reports 0 while in pseudo-locksetup mode. */
+                       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+                               size = 0;
+                       } else {
+                               /* With mba_sc the value comes from mbps_val[]. */
+                               if (is_mba_sc(r))
+                                       ctrl = d->mbps_val[closid];
+                               else
+                                       ctrl = resctrl_arch_get_config(r, d,
+                                                                      closid,
+                                                                      type);
+                               /*
+                                * MBA and SMBA print the control value as is;
+                                * other resources convert it as a CBM to bytes.
+                                */
+                               if (r->rid == RDT_RESOURCE_MBA ||
+                                   r->rid == RDT_RESOURCE_SMBA)
+                                       size = ctrl;
+                               else
+                                       size = rdtgroup_cbm_to_size(r, d, ctrl);
+                       }
+                       seq_printf(s, "%d=%u", d->hdr.id, size);
+                       sep = true;
+               }
+               seq_putc(s, '\n');
+       }
+
+out:
+       rdtgroup_kn_unlock(of->kn);
+
+       return ret;
+}
+
+/*
+ * Run resctrl_arch_mon_event_config_read() with @mon_info on one CPU
+ * taken from the cpu_mask of the domain in @mon_info.
+ */
+static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
+{
+       struct rdt_mon_domain *dom = mon_info->d;
+
+       smp_call_function_any(&dom->hdr.cpu_mask,
+                             resctrl_arch_mon_event_config_read, mon_info, 1);
+}
+
+/*
+ * Print the configuration of event @evtid for every monitor domain of @r
+ * as "<domain id>=0x<config>" entries separated by ';' on a single line.
+ * Always returns 0.
+ */
+static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+{
+       struct resctrl_mon_config_info info;
+       struct rdt_mon_domain *d;
+       bool first = true;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       list_for_each_entry(d, &r->mon_domains, hdr.list) {
+               if (!first)
+                       seq_puts(s, ";");
+               first = false;
+
+               memset(&info, 0, sizeof(info));
+               info.r = r;
+               info.d = d;
+               info.evtid = evtid;
+               mondata_config_read(&info);
+
+               seq_printf(s, "%d=0x%02x", d->hdr.id, info.mon_config);
+       }
+       seq_puts(s, "\n");
+
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+
+       return 0;
+}
+
+/* Show the QOS_L3_MBM_TOTAL_EVENT_ID configuration of every domain. */
+static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
+                                      struct seq_file *seq, void *v)
+{
+       struct rdt_resource *res;
+
+       res = rdt_kn_parent_priv(of->kn);
+       mbm_config_show(seq, res, QOS_L3_MBM_TOTAL_EVENT_ID);
+       return 0;
+}
+
+/* Show the QOS_L3_MBM_LOCAL_EVENT_ID configuration of every domain. */
+static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
+                                      struct seq_file *seq, void *v)
+{
+       struct rdt_resource *res;
+
+       res = rdt_kn_parent_priv(of->kn);
+       mbm_config_show(seq, res, QOS_L3_MBM_LOCAL_EVENT_ID);
+       return 0;
+}
+
+/*
+ * Apply configuration @val for event @evtid to monitor domain @d of @r.
+ * Nothing is written, and no counters are reset, when the configuration
+ * read back from the domain already equals @val.
+ */
+static void mbm_config_write_domain(struct rdt_resource *r,
+                                   struct rdt_mon_domain *d, u32 evtid, u32 val)
+{
+       struct resctrl_mon_config_info info = {
+               .r = r,
+               .d = d,
+               .evtid = evtid,
+       };
+
+       /*
+        * Read the current config value first. If both are the same then
+        * no need to write it again.
+        */
+       mondata_config_read(&info);
+       if (info.mon_config == val)
+               return;
+
+       info.mon_config = val;
+
+       /*
+        * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
+        * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
+        * are scoped at the domain level. Writing any of these MSRs
+        * on one CPU is observed by all the CPUs in the domain.
+        */
+       smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
+                             &info, 1);
+
+       /*
+        * When an Event Configuration is changed, the bandwidth counters
+        * for all RMIDs and Events will be cleared by the hardware. The
+        * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+        * every RMID on the next read to any event for every RMID.
+        * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+        * cleared while it is tracked by the hardware. Clear the
+        * mbm_local and mbm_total counts for all the RMIDs.
+        */
+       resctrl_arch_reset_rmid_all(r, d);
+}
+
+/*
+ * mon_config_write - Parse and apply per-domain event configurations
+ * @r:     Resource whose monitor domains are being configured.
+ * @tok:   NUL-terminated list of "<domain id>=<value>" entries separated
+ *         by ';'. The id is parsed as decimal and the value as hex. The
+ *         string is modified while it is parsed.
+ * @evtid: Event the configuration applies to.
+ *
+ * Entries are applied one at a time in the order given, so entries that
+ * precede a rejected one have already been applied when an error is
+ * returned. The reason for a rejection is recorded in the last command
+ * status.
+ *
+ * Return: 0 once the whole string has been consumed, -EINVAL for a
+ * malformed entry, a value with bits outside r->mbm_cfg_mask, or a domain
+ * id that matches no monitor domain of @r.
+ */
+static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
+{
+       unsigned long dom_id, val;
+       struct rdt_mon_domain *d;
+       char *dom_str, *id_str;
+       bool found;
+
+       /* Walking r->mon_domains, ensure it can't race with cpuhp */
+       lockdep_assert_cpus_held();
+
+       while (tok && tok[0] != '\0') {
+               /* Start processing the strings for each domain */
+               dom_str = strim(strsep(&tok, ";"));
+               id_str = strsep(&dom_str, "=");
+
+               if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+                       rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+                       return -EINVAL;
+               }
+
+               if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+                       rdt_last_cmd_puts("Non-numeric event configuration value\n");
+                       return -EINVAL;
+               }
+
+               /* Value from user cannot be more than the supported set of events */
+               if ((val & r->mbm_cfg_mask) != val) {
+                       rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+                                           r->mbm_cfg_mask);
+                       return -EINVAL;
+               }
+
+               found = false;
+               list_for_each_entry(d, &r->mon_domains, hdr.list) {
+                       if (d->hdr.id == dom_id) {
+                               mbm_config_write_domain(r, d, evtid, val);
+                               found = true;
+                               break;
+                       }
+               }
+
+               /* Report why the write failed instead of a bare -EINVAL. */
+               if (!found) {
+                       rdt_last_cmd_printf("Invalid domain %lu\n", dom_id);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Apply the event configurations in @buf to QOS_L3_MBM_TOTAL_EVENT_ID.
+ * The input must end in a newline. Returns @nbytes on success, -EINVAL
+ * for a missing trailing newline, or the error from mon_config_write().
+ */
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+                                           char *buf, size_t nbytes,
+                                           loff_t off)
+{
+       struct rdt_resource *res = rdt_kn_parent_priv(of->kn);
+       int err;
+
+       /* Valid input requires a trailing newline */
+       if (!nbytes || buf[nbytes - 1] != '\n')
+               return -EINVAL;
+       buf[nbytes - 1] = '\0';
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       rdt_last_cmd_clear();
+       err = mon_config_write(res, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+
+       if (err)
+               return err;
+
+       return nbytes;
+}
+
+/*
+ * Apply the event configurations in @buf to QOS_L3_MBM_LOCAL_EVENT_ID.
+ * The input must end in a newline. Returns @nbytes on success, -EINVAL
+ * for a missing trailing newline, or the error from mon_config_write().
+ */
+static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
+                                           char *buf, size_t nbytes,
+                                           loff_t off)
+{
+       struct rdt_resource *res = rdt_kn_parent_priv(of->kn);
+       int err;
+
+       /* Valid input requires a trailing newline */
+       if (!nbytes || buf[nbytes - 1] != '\n')
+               return -EINVAL;
+       buf[nbytes - 1] = '\0';
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       rdt_last_cmd_clear();
+       err = mon_config_write(res, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+
+       if (err)
+               return err;
+
+       return nbytes;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_common_files[] = {
+       {
+               .name           = "last_cmd_status",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_last_cmd_status_show,
+               .fflags         = RFTYPE_TOP_INFO,
+       },
+       {
+               .name           = "num_closids",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_num_closids_show,
+               .fflags         = RFTYPE_CTRL_INFO,
+       },
+       {
+               .name           = "mon_features",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_mon_features_show,
+               .fflags         = RFTYPE_MON_INFO,
+       },
+       {
+               .name           = "num_rmids",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_num_rmids_show,
+               .fflags         = RFTYPE_MON_INFO,
+       },
+       {
+               .name           = "cbm_mask",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_default_ctrl_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "min_cbm_bits",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_min_cbm_bits_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "shareable_bits",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_shareable_bits_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "bit_usage",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_bit_usage_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "min_bandwidth",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_min_bw_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+       },
+       {
+               .name           = "bandwidth_gran",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_bw_gran_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+       },
+       {
+               .name           = "delay_linear",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_delay_linear_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+       },
+       /*
+        * Platform specific which (if any) capabilities are provided by
+        * thread_throttle_mode. Defer "fflags" initialization to platform
+        * discovery.
+        */
+       {
+               .name           = "thread_throttle_mode",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_thread_throttle_mode_show,
+       },
+       {
+               .name           = "max_threshold_occupancy",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = max_threshold_occ_write,
+               .seq_show       = max_threshold_occ_show,
+               .fflags         = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "mbm_total_bytes_config",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = mbm_total_bytes_config_show,
+               .write          = mbm_total_bytes_config_write,
+       },
+       {
+               .name           = "mbm_local_bytes_config",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = mbm_local_bytes_config_show,
+               .write          = mbm_local_bytes_config_write,
+       },
+       {
+               .name           = "cpus",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_cpus_write,
+               .seq_show       = rdtgroup_cpus_show,
+               .fflags         = RFTYPE_BASE,
+       },
+       {
+               .name           = "cpus_list",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_cpus_write,
+               .seq_show       = rdtgroup_cpus_show,
+               .flags          = RFTYPE_FLAGS_CPUS_LIST,
+               .fflags         = RFTYPE_BASE,
+       },
+       {
+               .name           = "tasks",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_tasks_write,
+               .seq_show       = rdtgroup_tasks_show,
+               .fflags         = RFTYPE_BASE,
+       },
+       {
+               .name           = "mon_hw_id",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdtgroup_rmid_show,
+               .fflags         = RFTYPE_MON_BASE | RFTYPE_DEBUG,
+       },
+       {
+               .name           = "schemata",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_schemata_write,
+               .seq_show       = rdtgroup_schemata_show,
+               .fflags         = RFTYPE_CTRL_BASE,
+       },
+       {
+               .name           = "mba_MBps_event",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_mba_mbps_event_write,
+               .seq_show       = rdtgroup_mba_mbps_event_show,
+       },
+       {
+               .name           = "mode",
+               .mode           = 0644,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .write          = rdtgroup_mode_write,
+               .seq_show       = rdtgroup_mode_show,
+               .fflags         = RFTYPE_CTRL_BASE,
+       },
+       {
+               .name           = "size",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdtgroup_size_show,
+               .fflags         = RFTYPE_CTRL_BASE,
+       },
+       {
+               .name           = "sparse_masks",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdt_has_sparse_bitmasks_show,
+               .fflags         = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+       },
+       {
+               .name           = "ctrl_hw_id",
+               .mode           = 0444,
+               .kf_ops         = &rdtgroup_kf_single_ops,
+               .seq_show       = rdtgroup_closid_show,
+               .fflags         = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
+       },
+};
+
+/*
+ * Add to @kn every file from res_common_files whose (non-zero) fflags are
+ * all contained in @fflags. RFTYPE_DEBUG is added to @fflags when
+ * resctrl_debug is set. Must be called with rdtgroup_mutex held.
+ *
+ * Return: 0 on success. On failure the files added so far are removed
+ * again and the error from rdtgroup_add_file() is returned.
+ */
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
+{
+       struct rftype *rfts, *rft;
+       int ret, len;
+
+       rfts = res_common_files;
+       len = ARRAY_SIZE(res_common_files);
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       if (resctrl_debug)
+               fflags |= RFTYPE_DEBUG;
+
+       for (rft = rfts; rft < rfts + len; rft++) {
+               if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
+                       ret = rdtgroup_add_file(kn, rft);
+                       if (ret)
+                               goto error;
+               }
+       }
+
+       return 0;
+error:
+       pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+       /*
+        * Undo in reverse, using the same predicate as the add loop so that
+        * entries with no fflags (never added) are not looked up, and
+        * without stepping the cursor below the start of the array.
+        */
+       while (rft > rfts) {
+               rft--;
+               if (rft->fflags && ((fflags & rft->fflags) == rft->fflags))
+                       kernfs_remove_by_name(kn, rft->name);
+       }
+       return ret;
+}
+
+/*
+ * Look up the res_common_files entry called @name.
+ *
+ * Return: the first matching entry, or NULL when there is none.
+ */
+static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
+{
+       int count = ARRAY_SIZE(res_common_files);
+       int i;
+
+       for (i = 0; i < count; i++) {
+               struct rftype *entry = &res_common_files[i];
+
+               if (strcmp(entry->name, name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Expose "thread_throttle_mode" in the MB info directories when MBA or
+ * SMBA is alloc capable and reports a defined throttle mode. Otherwise the
+ * file keeps its fflags and nothing is changed.
+ */
+static void thread_throttle_mode_init(void)
+{
+       enum membw_throttle_mode mode = THREAD_THROTTLE_UNDEFINED;
+       struct rdt_resource *res;
+
+       res = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+       if (res->alloc_capable &&
+           res->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+               mode = res->membw.throttle_mode;
+
+       /* A defined SMBA mode takes precedence over the MBA one. */
+       res = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
+       if (res->alloc_capable &&
+           res->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+               mode = res->membw.throttle_mode;
+
+       if (mode != THREAD_THROTTLE_UNDEFINED)
+               resctrl_file_fflags_init("thread_throttle_mode",
+                                        RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
+}
+
+/*
+ * Set the fflags of the res_common_files entry named @config to @fflags.
+ * An unknown @config is silently ignored.
+ */
+void resctrl_file_fflags_init(const char *config, unsigned long fflags)
+{
+       struct rftype *entry = rdtgroup_get_rftype_by_name(config);
+
+       if (!entry)
+               return;
+
+       entry->fflags = fflags;
+}
+
+/**
+ * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ *
+ * The mode of the named resctrl file, directory, or link is reduced to its
+ * file type bits, so no user may read, write, or execute it.
+ *
+ * WARNING: This only signals to the user that the resctrl file is locked
+ * down because it is not relevant in the current state of the system. It
+ * is not a protection mechanism: the user can still change the permissions
+ * again with chmod from the command line.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
+{
+       struct iattr attr = { .ia_valid = ATTR_MODE };
+       struct kernfs_node *target;
+       int err;
+
+       target = kernfs_find_and_get_ns(r->kn, name, NULL);
+       if (!target)
+               return -ENOENT;
+
+       /* Only the type bits survive; every permission bit stays clear. */
+       switch (kernfs_type(target)) {
+       case KERNFS_DIR:
+               attr.ia_mode = S_IFDIR;
+               break;
+       case KERNFS_FILE:
+               attr.ia_mode = S_IFREG;
+               break;
+       case KERNFS_LINK:
+               attr.ia_mode = S_IFLNK;
+               break;
+       }
+
+       err = kernfs_setattr(target, &attr);
+       /* Drop the reference taken by the lookup above. */
+       kernfs_put(target);
+
+       return err;
+}
+
+/**
+ * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ * @mask: Mask of permissions that should be restored
+ *
+ * The mode recorded for @name in res_common_files, limited to @mask, is
+ * applied again. For a directory the mode of its parent is OR-ed in as
+ * well.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+                            umode_t mask)
+{
+       struct iattr attr = { .ia_valid = ATTR_MODE };
+       int count = ARRAY_SIZE(res_common_files);
+       struct kernfs_node *target, *up;
+       int i, err;
+
+       /* No early exit: the last table entry named @name decides. */
+       for (i = 0; i < count; i++) {
+               if (strcmp(res_common_files[i].name, name) == 0)
+                       attr.ia_mode = res_common_files[i].mode & mask;
+       }
+
+       target = kernfs_find_and_get_ns(r->kn, name, NULL);
+       if (!target)
+               return -ENOENT;
+
+       switch (kernfs_type(target)) {
+       case KERNFS_DIR:
+               up = kernfs_get_parent(target);
+               if (up) {
+                       attr.ia_mode |= up->mode;
+                       kernfs_put(up);
+               }
+               attr.ia_mode |= S_IFDIR;
+               break;
+       case KERNFS_FILE:
+               attr.ia_mode |= S_IFREG;
+               break;
+       case KERNFS_LINK:
+               attr.ia_mode |= S_IFLNK;
+               break;
+       }
+
+       err = kernfs_setattr(target, &attr);
+       /* Drop the reference taken by the lookup above. */
+       kernfs_put(target);
+
+       return err;
+}
+
+/*
+ * Create the directory @name below kn_info with @priv as its private data,
+ * fill it with the files selected by @fflags and activate it.
+ *
+ * Return: 0 on success, <0 on failure. A directory that was created is not
+ * removed here when a later step fails.
+ */
+static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
+                                     unsigned long fflags)
+{
+       struct kernfs_node *dir;
+       int err;
+
+       dir = kernfs_create_dir(kn_info, name, kn_info->mode, priv);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       err = rdtgroup_kn_set_ugid(dir);
+       if (!err)
+               err = rdtgroup_add_files(dir, fflags);
+       if (err)
+               return err;
+
+       kernfs_activate(dir);
+
+       return 0;
+}
+
+/*
+ * Map a resource to its RFTYPE_RES_* file flag: cache resources (L3, L2)
+ * use RFTYPE_RES_CACHE, memory bandwidth resources (MBA, SMBA) use
+ * RFTYPE_RES_MB. Any other resource id triggers a one-time warning and the
+ * result of WARN_ON_ONCE(1) is returned.
+ */
+static unsigned long fflags_from_resource(struct rdt_resource *r)
+{
+       if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2)
+               return RFTYPE_RES_CACHE;
+
+       if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
+               return RFTYPE_RES_MB;
+
+       return WARN_ON_ONCE(1);
+}
+
+/*
+ * Create the "info" directory below @parent_kn and populate it: the
+ * RFTYPE_TOP_INFO files, one subdirectory per schema in resctrl_schema_all
+ * and one "<resource>_MON" subdirectory per mon capable resource. The
+ * directory is only activated once everything has been added.
+ *
+ * Return: 0 on success, <0 on failure. On failure after "info" has been
+ * created, it is removed again together with whatever was added below it.
+ */
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+       struct resctrl_schema *s;
+       struct rdt_resource *r;
+       unsigned long fflags;
+       char name[32];
+       int ret;
+
+       /* create the directory */
+       kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+       if (IS_ERR(kn_info))
+               return PTR_ERR(kn_info);
+
+       ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
+       if (ret)
+               goto out_destroy;
+
+       /* loop over enabled controls, these are all alloc_capable */
+       list_for_each_entry(s, &resctrl_schema_all, list) {
+               r = s->res;
+               fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
+               ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
+               if (ret)
+                       goto out_destroy;
+       }
+
+       for_each_mon_capable_rdt_resource(r) {
+               fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
+               /*
+                * Bound the write to the on-stack buffer and refuse a name
+                * that does not fit rather than overrunning it.
+                */
+               ret = snprintf(name, sizeof(name), "%s_MON", r->name);
+               if (ret >= sizeof(name)) {
+                       ret = -EINVAL;
+                       goto out_destroy;
+               }
+               ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
+               if (ret)
+                       goto out_destroy;
+       }
+
+       ret = rdtgroup_kn_set_ugid(kn_info);
+       if (ret)
+               goto out_destroy;
+
+       kernfs_activate(kn_info);
+
+       return 0;
+
+out_destroy:
+       kernfs_remove(kn_info);
+       return ret;
+}
+
+/*
+ * Create and activate the directory @name below @parent_kn, with @prgrp as
+ * its private data. When @dest_kn is not NULL the new node is stored there
+ * as soon as it has been created, i.e. also when a later step fails and
+ * the node is removed again.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+                   char *name, struct kernfs_node **dest_kn)
+{
+       struct kernfs_node *dir;
+       int err;
+
+       dir = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       if (dest_kn)
+               *dest_kn = dir;
+
+       err = rdtgroup_kn_set_ugid(dir);
+       if (err) {
+               kernfs_remove(dir);
+               return err;
+       }
+
+       kernfs_activate(dir);
+
+       return 0;
+}
+
+/* Report the membw.delay_linear setting of the MBA resource. */
+static inline bool is_mba_linear(void)
+{
+       struct rdt_resource *mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+
+       return mba->membw.delay_linear;
+}
+
+/*
+ * Allocate @d->mbps_val with one entry per closid of @r, on the NUMA node
+ * of one of the CPUs in the domain, and set every entry to MBA_MAX_MBPS.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails.
+ */
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+       u32 closids = resctrl_arch_get_num_closid(r);
+       int node = cpu_to_node(cpumask_any(&d->hdr.cpu_mask));
+       int idx = 0;
+
+       d->mbps_val = kcalloc_node(closids, sizeof(*d->mbps_val),
+                                  GFP_KERNEL, node);
+       if (!d->mbps_val)
+               return -ENOMEM;
+
+       while (idx < closids) {
+               d->mbps_val[idx] = MBA_MAX_MBPS;
+               idx++;
+       }
+
+       return 0;
+}
+
+/* Free @d->mbps_val and leave the pointer NULL. @r is unused. */
+static void mba_sc_domain_destroy(struct rdt_resource *r,
+                                 struct rdt_ctrl_domain *d)
+{
+       void *stale = d->mbps_val;
+
+       d->mbps_val = NULL;
+       kfree(stale);
+}
+
+/*
+ * The MBA software controller can only be used when MBM is enabled, MBA
+ * is alloc capable and uses a linear scale, and the control scope of MBA
+ * equals the monitor scope of the L3 resource.
+ */
+static bool supports_mba_mbps(void)
+{
+       struct rdt_resource *mon_res = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       struct rdt_resource *mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+
+       if (!resctrl_is_mbm_enabled())
+               return false;
+
+       if (!mba->alloc_capable || !is_mba_linear())
+               return false;
+
+       return mba->ctrl_scope == mon_res->mon_scope;
+}
+
+/*
+ * Switch the MBA software controller, which lets the user specify
+ * bandwidth in MBps, on or off.
+ *
+ * Return: 0 on success, -EINVAL when the software controller is not
+ * supported or is already in the requested state.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+       struct rdt_resource *mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+       u32 closids = resctrl_arch_get_num_closid(mba);
+       unsigned long file_flags = 0;
+       struct rdt_ctrl_domain *dom;
+       int idx;
+
+       if (!supports_mba_mbps())
+               return -EINVAL;
+       if (mba_sc == is_mba_sc(mba))
+               return -EINVAL;
+
+       mba->membw.mba_sc = mba_sc;
+
+       rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
+
+       /* Every closid in every control domain starts at the maximum. */
+       list_for_each_entry(dom, &mba->ctrl_domains, hdr.list) {
+               for (idx = 0; idx < closids; idx++)
+                       dom->mbps_val[idx] = MBA_MAX_MBPS;
+       }
+
+       /* "mba_MBps_event" only has fflags while the controller is on. */
+       if (mba_sc)
+               file_flags = RFTYPE_CTRL_BASE | RFTYPE_MON_BASE;
+       resctrl_file_fflags_init("mba_MBps_event", file_flags);
+
+       return 0;
+}
+
+/*
+ * rdtgroup directories can only be created in the root directory. So the
+ * rdtgroup for a kernfs node is found either through the "priv" field of
+ * the node itself when it is a directory, or through the parent when the
+ * node is a file.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+       if (kernfs_type(kn) != KERNFS_DIR)
+               return rdt_kn_parent_priv(kn);
+
+       /*
+        * "info" and its subdirectories have no "struct rdtgroup". Every
+        * other resource directory points to its group with "kn->priv".
+        */
+       if (kn == kn_info || rcu_access_pointer(kn->__parent) == kn_info)
+               return NULL;
+
+       return kn->priv;
+}
+
+/*
+ * Take a waitcount reference on @rdtgrp and break kernfs active protection
+ * on @kn. Both are undone by rdtgroup_kn_put().
+ */
+static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+       atomic_inc(&rdtgrp->waitcount);
+       kernfs_break_active_protection(kn);
+}
+
+/*
+ * Drop a waitcount reference on @rdtgrp and restore kernfs active
+ * protection on @kn. When this was the last waiter of a group flagged
+ * RDT_DELETED, the group is removed as well, after any pseudo-lock state
+ * it holds.
+ */
+static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+       bool last_of_deleted;
+
+       last_of_deleted = atomic_dec_and_test(&rdtgrp->waitcount) &&
+                         (rdtgrp->flags & RDT_DELETED);
+       if (!last_of_deleted) {
+               kernfs_unbreak_active_protection(kn);
+               return;
+       }
+
+       if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+           rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+               rdtgroup_pseudo_lock_remove(rdtgrp);
+
+       kernfs_unbreak_active_protection(kn);
+       rdtgroup_remove(rdtgrp);
+}
+
+/*
+ * Find the rdtgroup that @kn belongs to, take a reference on it and
+ * acquire cpus_read_lock() followed by rdtgroup_mutex.
+ *
+ * Return: the group, or NULL. NULL is returned without taking anything
+ * when @kn has no rdtgroup. NULL is also returned when the group was
+ * flagged RDT_DELETED in the meantime, but then the reference and both
+ * locks are still held. rdtgroup_kn_unlock() releases them.
+ */
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+       struct rdtgroup *grp = kernfs_to_rdtgroup(kn);
+
+       if (!grp)
+               return NULL;
+
+       rdtgroup_kn_get(grp, kn);
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       /* The group may have been deleted while waiting for the locks. */
+       return (grp->flags & RDT_DELETED) ? NULL : grp;
+}
+
+/*
+ * Release rdtgroup_mutex and cpus_read_lock() and drop the reference on
+ * the rdtgroup that @kn belongs to. Does nothing when @kn has no rdtgroup.
+ */
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+       struct rdtgroup *grp = kernfs_to_rdtgroup(kn);
+
+       if (grp) {
+               mutex_unlock(&rdtgroup_mutex);
+               cpus_read_unlock();
+
+               rdtgroup_kn_put(grp, kn);
+       }
+}
+
+/*
+ * Forward declaration: rdt_get_tree() below calls this before any
+ * definition has been seen (presumably it is defined further down).
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+                            struct rdtgroup *prgrp,
+                            struct kernfs_node **mon_data_kn);
+
+/*
+ * Switch off what rdt_enable_ctx() may have switched on: CDP on L3 and L2,
+ * the MBA software controller and resctrl_debug. The return values of the
+ * individual calls are ignored.
+ */
+static void rdt_disable_ctx(void)
+{
+       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+       set_mba_sc(false);
+
+       resctrl_debug = false;
+}
+
+/*
+ * Apply the options recorded in @ctx: CDP on L2, CDP on L3, the MBA
+ * software controller and debug, in that order.
+ *
+ * Return: 0 on success. When a step fails its error is returned and CDP
+ * is disabled again on the resources handled before the failing step.
+ */
+static int rdt_enable_ctx(struct rdt_fs_context *ctx)
+{
+       int err;
+
+       if (ctx->enable_cdpl2) {
+               err = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
+               if (err)
+                       return err;
+       }
+
+       if (ctx->enable_cdpl3) {
+               err = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
+               if (err) {
+                       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+                       return err;
+               }
+       }
+
+       if (ctx->enable_mba_mbps) {
+               err = set_mba_sc(true);
+               if (err) {
+                       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+                       resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+                       return err;
+               }
+       }
+
+       if (ctx->enable_debug)
+               resctrl_debug = true;
+
+       return 0;
+}
+
+/*
+ * Allocate a schema for @r with configuration type @type, name it after
+ * the resource (plus "CODE"/"DATA" for the CDP types), pick its format
+ * string and add it to resctrl_schema_all. max_name_width is raised when
+ * the new name is wider than any seen so far.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails, -EINVAL if the
+ * name does not fit or the resource has no known schema format.
+ */
+static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
+{
+       const char *suffix = "";
+       struct resctrl_schema *schema;
+       int written, width;
+
+       schema = kzalloc(sizeof(*schema), GFP_KERNEL);
+       if (!schema)
+               return -ENOMEM;
+
+       schema->res = r;
+       schema->num_closid = resctrl_arch_get_num_closid(r);
+       if (resctrl_arch_get_cdp_enabled(r->rid))
+               schema->num_closid /= 2;
+
+       schema->conf_type = type;
+       if (type == CDP_CODE)
+               suffix = "CODE";
+       else if (type == CDP_DATA)
+               suffix = "DATA";
+
+       written = snprintf(schema->name, sizeof(schema->name), "%s%s",
+                          r->name, suffix);
+       if (written >= sizeof(schema->name))
+               goto out_invalid;
+
+       width = strlen(schema->name);
+
+       /*
+        * A resource that supports CDP but does not have it enabled is
+        * accounted for as if it carried the suffix, so the tabular format
+        * of the schemata file stays the same between mounts.
+        */
+       if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
+               width += 4;
+
+       if (width > max_name_width)
+               max_name_width = width;
+
+       switch (r->schema_fmt) {
+       case RESCTRL_SCHEMA_BITMAP:
+               schema->fmt_str = "%d=%x";
+               break;
+       case RESCTRL_SCHEMA_RANGE:
+               schema->fmt_str = "%d=%u";
+               break;
+       }
+
+       if (WARN_ON_ONCE(!schema->fmt_str))
+               goto out_invalid;
+
+       INIT_LIST_HEAD(&schema->list);
+       list_add(&schema->list, &resctrl_schema_all);
+
+       return 0;
+
+out_invalid:
+       kfree(schema);
+       return -EINVAL;
+}
+
+/*
+ * Add the schemata of every alloc capable resource: a CODE and a DATA
+ * schema when CDP is enabled for the resource, a single schema otherwise.
+ *
+ * Return: 0 on success, or the first error from schemata_list_add().
+ * Schemata added before the error are left on the list.
+ */
+static int schemata_list_create(void)
+{
+       struct rdt_resource *r;
+       int err;
+
+       for_each_alloc_capable_rdt_resource(r) {
+               if (!resctrl_arch_get_cdp_enabled(r->rid)) {
+                       err = schemata_list_add(r, CDP_NONE);
+               } else {
+                       err = schemata_list_add(r, CDP_CODE);
+                       if (!err)
+                               err = schemata_list_add(r, CDP_DATA);
+               }
+
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/* Unlink and free every schema on resctrl_schema_all. */
+static void schemata_list_destroy(void)
+{
+       struct resctrl_schema *schema;
+
+       while (!list_empty(&resctrl_schema_all)) {
+               schema = list_first_entry(&resctrl_schema_all,
+                                         struct resctrl_schema, list);
+               list_del(&schema->list);
+               kfree(schema);
+       }
+}
+
+/*
+ * fs_context ->get_tree() handler for resctrl (see rdt_fs_context_ops).
+ *
+ * With cpus_read_lock() and rdtgroup_mutex held: refuse a second mount
+ * with -EBUSY, set up the root, apply the mount options, build the schema
+ * list, initialise closids, populate the default group's files, "info"
+ * and (when monitoring is available) the "mon_groups" and mon_data
+ * directories, initialise pseudo-locking and finally call
+ * kernfs_get_tree(). When a step fails, the steps already completed are
+ * undone in reverse order through the out_* labels.
+ *
+ * rdt_last_cmd_clear() is called on every exit path, success or failure.
+ *
+ * Return: <0 on failure, otherwise the result of kernfs_get_tree().
+ */
+static int rdt_get_tree(struct fs_context *fc)
+{
+       struct rdt_fs_context *ctx = rdt_fc2context(fc);
+       unsigned long flags = RFTYPE_CTRL_BASE;
+       struct rdt_mon_domain *dom;
+       struct rdt_resource *r;
+       int ret;
+
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+       /*
+        * resctrl file system can only be mounted once.
+        */
+       if (resctrl_mounted) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       ret = rdtgroup_setup_root(ctx);
+       if (ret)
+               goto out;
+
+       ret = rdt_enable_ctx(ctx);
+       if (ret)
+               goto out_root;
+
+       ret = schemata_list_create();
+       if (ret) {
+               /* Free whatever was added before the failure. */
+               schemata_list_destroy();
+               goto out_ctx;
+       }
+
+       ret = closid_init();
+       if (ret)
+               goto out_schemata_free;
+
+       if (resctrl_arch_mon_capable())
+               flags |= RFTYPE_MON;
+
+       ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
+       if (ret)
+               goto out_closid_exit;
+
+       kernfs_activate(rdtgroup_default.kn);
+
+       ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+       if (ret < 0)
+               goto out_closid_exit;
+
+       if (resctrl_arch_mon_capable()) {
+               ret = mongroup_create_dir(rdtgroup_default.kn,
+                                         &rdtgroup_default, "mon_groups",
+                                         &kn_mongrp);
+               if (ret < 0)
+                       goto out_info;
+
+               ret = mkdir_mondata_all(rdtgroup_default.kn,
+                                       &rdtgroup_default, &kn_mondata);
+               if (ret < 0)
+                       goto out_mongrp;
+               rdtgroup_default.mon.mon_data_kn = kn_mondata;
+       }
+
+       ret = rdt_pseudo_lock_init();
+       if (ret)
+               goto out_mondata;
+
+       ret = kernfs_get_tree(fc);
+       if (ret < 0)
+               goto out_psl;
+
+       if (resctrl_arch_alloc_capable())
+               resctrl_arch_enable_alloc();
+       if (resctrl_arch_mon_capable())
+               resctrl_arch_enable_mon();
+
+       if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
+               resctrl_mounted = true;
+
+       if (resctrl_is_mbm_enabled()) {
+               r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+               list_for_each_entry(dom, &r->mon_domains, hdr.list)
+                       mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
+                                                  RESCTRL_PICK_ANY_CPU);
+       }
+
+       /* Success: skip the unwinding and only drop the locks. */
+       goto out;
+
+out_psl:
+       rdt_pseudo_lock_release();
+out_mondata:
+       if (resctrl_arch_mon_capable())
+               kernfs_remove(kn_mondata);
+out_mongrp:
+       if (resctrl_arch_mon_capable())
+               kernfs_remove(kn_mongrp);
+out_info:
+       kernfs_remove(kn_info);
+out_closid_exit:
+       closid_exit();
+out_schemata_free:
+       schemata_list_destroy();
+out_ctx:
+       rdt_disable_ctx();
+out_root:
+       rdtgroup_destroy_root();
+out:
+       rdt_last_cmd_clear();
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+       return ret;
+}
+
+/*
+ * Mount option identifiers. rdt_fs_parameters maps option names to these
+ * values and rdt_parse_param() acts on them.
+ */
+enum rdt_param {
+       Opt_cdp,
+       Opt_cdpl2,
+       Opt_mba_mbps,
+       Opt_debug,
+       nr__rdt_params
+};
+
+/*
+ * Mount options understood by resctrl. All of them are flags that take no
+ * value. The table ends with an empty entry.
+ */
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
+       fsparam_flag("cdp",             Opt_cdp),
+       fsparam_flag("cdpl2",           Opt_cdpl2),
+       fsparam_flag("mba_MBps",        Opt_mba_mbps),
+       fsparam_flag("debug",           Opt_debug),
+       {}
+};
+
+/*
+ * Parse one mount option and record it in the rdt_fs_context of @fc.
+ *
+ * Return: 0 on success, the error from fs_parse() if the option is not
+ * recognised, the result of invalfc() when "mba_MBps" is requested but
+ * not supported, -EINVAL for any other option value.
+ */
+static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+       const char *msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
+       struct rdt_fs_context *ctx = rdt_fc2context(fc);
+       struct fs_parse_result result;
+       int opt;
+
+       opt = fs_parse(fc, rdt_fs_parameters, param, &result);
+       if (opt < 0)
+               return opt;
+
+       switch (opt) {
+       case Opt_cdp:
+               ctx->enable_cdpl3 = true;
+               break;
+       case Opt_cdpl2:
+               ctx->enable_cdpl2 = true;
+               break;
+       case Opt_mba_mbps:
+               if (!supports_mba_mbps())
+                       return invalfc(fc, msg);
+               ctx->enable_mba_mbps = true;
+               break;
+       case Opt_debug:
+               ctx->enable_debug = true;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * ->free() handler: release the kernfs part of @fc, then the
+ * rdt_fs_context allocated by rdt_init_fs_context().
+ */
+static void rdt_fs_context_free(struct fs_context *fc)
+{
+       struct rdt_fs_context *rctx;
+
+       /* Look the context up before the kernfs part is torn down. */
+       rctx = rdt_fc2context(fc);
+
+       kernfs_free_fs_context(fc);
+       kfree(rctx);
+}
+
+/* fs_context callbacks for resctrl, installed by rdt_init_fs_context(). */
+static const struct fs_context_operations rdt_fs_context_ops = {
+       .free           = rdt_fs_context_free,
+       .parse_param    = rdt_parse_param,
+       .get_tree       = rdt_get_tree,
+};
+
+/*
+ * ->init_fs_context() handler: allocate a zeroed rdt_fs_context, hook its
+ * kernfs part and the resctrl callbacks into @fc, replace the user
+ * namespace of @fc by init_user_ns and mark @fc global.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails.
+ */
+static int rdt_init_fs_context(struct fs_context *fc)
+{
+       struct rdt_fs_context *rctx = kzalloc(sizeof(*rctx), GFP_KERNEL);
+
+       if (!rctx)
+               return -ENOMEM;
+
+       rctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
+
+       fc->fs_private = &rctx->kfc;
+       fc->ops = &rdt_fs_context_ops;
+       fc->global = true;
+
+       /* Swap the reference on the caller's namespace for init_user_ns. */
+       put_user_ns(fc->user_ns);
+       fc->user_ns = get_user_ns(&init_user_ns);
+
+       return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ *
+ * A task is moved when its closid or its rmid matches @from. Moving means
+ * handing the closid and rmid of @to to resctrl_arch_set_closid_rmid().
+ * The task list is walked with tasklist_lock held for reading.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+                                struct cpumask *mask)
+{
+       struct task_struct *p, *t;
+
+       read_lock(&tasklist_lock);
+       for_each_process_thread(p, t) {
+               if (!from || is_closid_match(t, from) ||
+                   is_rmid_match(t, from)) {
+                       resctrl_arch_set_closid_rmid(t, to->closid,
+                                                    to->mon.rmid);
+
+                       /*
+                        * Order the closid/rmid stores above before the loads
+                        * in task_curr(). This pairs with the full barrier
+                        * between the rq->curr update and
+                        * resctrl_arch_sched_in() during context switch.
+                        */
+                       smp_mb();
+
+                       /*
+                        * If the task is on a CPU, set the CPU in the mask.
+                        * The detection is inaccurate as tasks might move or
+                        * schedule before the smp function call takes place.
+                        * In such a case the function call is pointless, but
+                        * there is no other side effect.
+                        */
+                       if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
+                               cpumask_set_cpu(task_cpu(t), mask);
+               }
+       }
+       read_unlock(&tasklist_lock);
+}
+
+/*
+ * Release every child group on @rdtgrp's mon.crdtgrp_list: free its rmid
+ * and unlink it. A child nobody is waiting on is removed right away, one
+ * that still has waiters is only flagged RDT_DELETED.
+ */
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+       struct list_head *children = &rdtgrp->mon.crdtgrp_list;
+       struct rdtgroup *child, *next;
+
+       list_for_each_entry_safe(child, next, children, mon.crdtgrp_list) {
+               free_rmid(child->closid, child->mon.rmid);
+               list_del(&child->mon.crdtgrp_list);
+
+               if (atomic_read(&child->waitcount) == 0)
+                       rdtgroup_remove(child);
+               else
+                       child->flags = RDT_DELETED;
+       }
+}
+
+/*
+ * Forcibly remove all subdirectories under root.
+ *
+ * All tasks are moved to the default group first. Then the child groups of
+ * every group are released and every group other than the default one is
+ * torn down: its CPUs go back to the default group, its rmid is freed and
+ * its kernfs directory removed. Finally the online CPUs are updated and
+ * the "info", mon_groups and mon_data nodes of the root are removed.
+ */
+static void rmdir_all_sub(void)
+{
+       struct rdtgroup *rdtgrp, *tmp;
+
+       /* Move all tasks to the default resource group */
+       rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+       list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+               /* Free any child rmids */
+               free_all_child_rdtgrp(rdtgrp);
+
+               /* Remove each rdtgroup other than root */
+               if (rdtgrp == &rdtgroup_default)
+                       continue;
+
+               if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+                   rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+                       rdtgroup_pseudo_lock_remove(rdtgrp);
+
+               /*
+                * Give any CPUs back to the default group. We cannot copy
+                * cpu_online_mask because a CPU might have executed the
+                * offline callback already, but is still marked online.
+                */
+               cpumask_or(&rdtgroup_default.cpu_mask,
+                          &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+               free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+               kernfs_remove(rdtgrp->kn);
+               list_del(&rdtgrp->rdtgroup_list);
+
+               /* Groups with waiters are only flagged, not removed here. */
+               if (atomic_read(&rdtgrp->waitcount) != 0)
+                       rdtgrp->flags = RDT_DELETED;
+               else
+                       rdtgroup_remove(rdtgrp);
+       }
+       /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+       update_closid_rmid(cpu_online_mask, &rdtgroup_default);
+
+       kernfs_remove(kn_info);
+       kernfs_remove(kn_mongrp);
+       kernfs_remove(kn_mondata);
+}
+
+/**
+ * mon_get_kn_priv() - Get the mon_data priv data for this event.
+ *
+ * The mon_data directories of all control and monitor groups share the
+ * same values for the same event in the same domain. Allocated structures
+ * are therefore kept on a list, and an existing one with the same values
+ * for @rid, @domid, etc. is re-used instead of allocating another.
+ *
+ * @rid:    The resource id for the event file being created.
+ * @domid:  The domain id for the event file being created.
+ * @mevt:   The type of event file being created.
+ * @do_sum: Whether SNC summing monitors are being created.
+ *
+ * Return: the matching or newly allocated structure, NULL if the
+ * allocation fails.
+ */
+static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
+                                       struct mon_evt *mevt,
+                                       bool do_sum)
+{
+       struct mon_data *entry;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       list_for_each_entry(entry, &mon_data_kn_priv_list, list) {
+               if (entry->rid != rid || entry->domid != domid)
+                       continue;
+               if (entry->sum != do_sum || entry->evtid != mevt->evtid)
+                       continue;
+               return entry;
+       }
+
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (entry) {
+               entry->rid = rid;
+               entry->domid = domid;
+               entry->sum = do_sum;
+               entry->evtid = mevt->evtid;
+               list_add_tail(&entry->list, &mon_data_kn_priv_list);
+       }
+
+       return entry;
+}
+
+/**
+ * mon_put_kn_priv() - Free all allocated mon_data structures.
+ *
+ * Empties mon_data_kn_priv_list. Called when resctrl file system is
+ * unmounted, with rdtgroup_mutex held.
+ */
+static void mon_put_kn_priv(void)
+{
+       struct mon_data *entry;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       while (!list_empty(&mon_data_kn_priv_list)) {
+               entry = list_first_entry(&mon_data_kn_priv_list,
+                                        struct mon_data, list);
+               list_del(&entry->list);
+               kfree(entry);
+       }
+}
+
+/*
+ * Tear down the filesystem state: remove all groups and directories, free
+ * the mon_data structures, release pseudo-locking, put the default group
+ * back into shareable mode, release closids and schemata and destroy the
+ * root. Must be called with rdtgroup_mutex held. Does nothing when the
+ * default group has no kernfs node, i.e. the root is already destroyed.
+ */
+static void resctrl_fs_teardown(void)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       /* Cleared by rdtgroup_destroy_root() */
+       if (!rdtgroup_default.kn)
+               return;
+
+       rmdir_all_sub();
+       mon_put_kn_priv();
+       rdt_pseudo_lock_release();
+       rdtgroup_default.mode = RDT_MODE_SHAREABLE;
+       closid_exit();
+       schemata_list_destroy();
+       rdtgroup_destroy_root();
+}
+
+/*
+ * kill_sb handler of the resctrl filesystem type: put the controls back to
+ * their defaults, tear down the filesystem state, disable allocation and
+ * monitoring where supported, and hand the superblock to kernfs_kill_sb().
+ */
+static void rdt_kill_sb(struct super_block *sb)
+{
+       struct rdt_resource *res;
+
+       /* The CPU hotplug read lock is taken before rdtgroup_mutex. */
+       cpus_read_lock();
+       mutex_lock(&rdtgroup_mutex);
+
+       rdt_disable_ctx();
+
+       /* Put everything back to default values. */
+       for_each_alloc_capable_rdt_resource(res) {
+               resctrl_arch_reset_all_ctrls(res);
+       }
+
+       resctrl_fs_teardown();
+
+       if (resctrl_arch_alloc_capable())
+               resctrl_arch_disable_alloc();
+       if (resctrl_arch_mon_capable())
+               resctrl_arch_disable_mon();
+
+       resctrl_mounted = false;
+       kernfs_kill_sb(sb);
+
+       /* Release the locks in the reverse order of acquisition. */
+       mutex_unlock(&rdtgroup_mutex);
+       cpus_read_unlock();
+}
+
+static struct file_system_type rdt_fs_type = {
+       .name                   = "resctrl",            /* filesystem type name */
+       .init_fs_context        = rdt_init_fs_context,
+       .parameters             = rdt_fs_parameters,
+       .kill_sb                = rdt_kill_sb,          /* resets controls and tears down resctrl state */
+};
+
+/*
+ * Create the read-only (0444) file @name below @parent_kn, served by
+ * kf_mondata_ops with @priv as its private data, and then let
+ * rdtgroup_kn_set_ugid() set its ownership. The new file is removed again
+ * if that second step fails.
+ *
+ * Return: 0 on success, otherwise the error from the file creation or from
+ * rdtgroup_kn_set_ugid().
+ */
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+                      void *priv)
+{
+       struct kernfs_node *file_kn;
+       int err;
+
+       file_kn = __kernfs_create_file(parent_kn, name, 0444,
+                                      GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+                                      &kf_mondata_ops, priv, NULL, NULL);
+       if (IS_ERR(file_kn))
+               return PTR_ERR(file_kn);
+
+       err = rdtgroup_kn_set_ugid(file_kn);
+       if (err)
+               kernfs_remove(file_kn);
+
+       return err;
+}
+
+/*
+ * Look up directory @name below @pkn and remove it entirely when it holds
+ * at most one subdirectory; otherwise remove only its child @subname.
+ * Nothing is done when @name does not exist.
+ */
+static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
+{
+       struct kernfs_node *dir_kn = kernfs_find_and_get(pkn, name);
+
+       if (!dir_kn)
+               return;
+
+       /*
+        * The reference taken by the lookup is dropped before dir_kn is used.
+        * NOTE(review): presumably safe because callers hold rdtgroup_mutex,
+        * which keeps the directory from going away - confirm.
+        */
+       kernfs_put(dir_kn);
+
+       if (dir_kn->dir.subdirs > 1) {
+               kernfs_remove_by_name(dir_kn, subname);
+               return;
+       }
+
+       kernfs_remove(dir_kn);
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups for the given domain.
+ * Remove files and directories containing "sum" of domain data
+ * when last domain being summed is removed.
+ *
+ * @r: Resource that @d belongs to; supplies the name used in the directory
+ *     names and the monitoring scope.
+ * @d: Monitor domain whose directories are removed.
+ */
+static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+                                          struct rdt_mon_domain *d)
+{
+       struct rdtgroup *prgrp, *crgrp;
+       /* Only filled in for SNC; start empty so it is never passed on uninitialised. */
+       char subname[32] = "";
+       bool snc_mode;
+       char name[32];
+
+       snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+
+       /*
+        * Bounded formatting: an unexpectedly long resource name is truncated
+        * instead of being written past the end of the stack buffers.
+        */
+       snprintf(name, sizeof(name), "mon_%s_%02d", r->name,
+                snc_mode ? d->ci->id : d->hdr.id);
+       if (snc_mode)
+               snprintf(subname, sizeof(subname), "mon_sub_%s_%02d", r->name, d->hdr.id);
+
+       list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+               /* The ctrl_mon group itself ... */
+               mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
+
+               /* ... and every monitor group on its child list. */
+               list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+                       mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
+       }
+}
+
+/*
+ * Add one file to directory @kn for every event on @r->evt_list.
+ *
+ * Each file's private data comes from mon_get_kn_priv(); the domain id
+ * recorded there is @d->ci->id when @do_sum is set and @d->hdr.id otherwise.
+ * Unless @do_sum is set, mon_event_read() is also called once for every
+ * MBM event.
+ *
+ * Return: 0 on success, -EPERM when the resource has no events, -EINVAL
+ * when no private data could be obtained, or the error from mon_addfile().
+ */
+static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
+                            struct rdt_resource *r, struct rdtgroup *prgrp,
+                            bool do_sum)
+{
+       struct rmid_read rr = {0};
+       struct mon_data *priv;
+       struct mon_evt *mevt;
+       int ret, domid;
+
+       if (WARN_ON(list_empty(&r->evt_list)))
+               return -EPERM;
+
+       /* The same for every event, so work it out once. */
+       if (do_sum)
+               domid = d->ci->id;
+       else
+               domid = d->hdr.id;
+
+       list_for_each_entry(mevt, &r->evt_list, list) {
+               priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
+               if (WARN_ON_ONCE(!priv))
+                       return -EINVAL;
+
+               ret = mon_addfile(kn, mevt->name, priv);
+               if (ret)
+                       return ret;
+
+               if (do_sum || !resctrl_is_mbm_event(mevt->evtid))
+                       continue;
+
+               mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
+       }
+
+       return 0;
+}
+
+/*
+ * Create the mon_data directory for monitor domain @d of resource @r below
+ * @parent_kn and fill it with one file per event.
+ *
+ * When the resource's monitoring scope is RESCTRL_L3_NODE (SNC), the
+ * directory is named after @d->ci->id, holds the "sum" files, and may
+ * already exist; a "mon_sub_<resource>_<domain id>" directory for @d is then
+ * created inside it. Otherwise the directory is named after @d->hdr.id.
+ *
+ * Must be called with rdtgroup_mutex held.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+                               struct rdt_mon_domain *d,
+                               struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+       struct kernfs_node *kn, *ckn;
+       char name[32];
+       bool snc_mode;
+       int ret = 0;
+
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+       /* Bounded formatting: never write past the end of name[]. */
+       snprintf(name, sizeof(name), "mon_%s_%02d", r->name,
+                snc_mode ? d->ci->id : d->hdr.id);
+       kn = kernfs_find_and_get(parent_kn, name);
+       if (kn) {
+               /*
+                * rdtgroup_mutex will prevent this directory from being
+                * removed. No need to keep this hold.
+                */
+               kernfs_put(kn);
+       } else {
+               kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+               if (IS_ERR(kn))
+                       return PTR_ERR(kn);
+
+               ret = rdtgroup_kn_set_ugid(kn);
+               if (ret)
+                       goto out_destroy;
+               ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
+               if (ret)
+                       goto out_destroy;
+       }
+
+       if (snc_mode) {
+               snprintf(name, sizeof(name), "mon_sub_%s_%02d", r->name, d->hdr.id);
+               ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
+               if (IS_ERR(ckn)) {
+                       /* Report the real failure reason, as done for the directory above. */
+                       ret = PTR_ERR(ckn);
+                       goto out_destroy;
+               }
+
+               ret = rdtgroup_kn_set_ugid(ckn);
+               if (ret)
+                       goto out_destroy;
+
+               ret = mon_add_all_files(ckn, d, r, prgrp, false);
+               if (ret)
+                       goto out_destroy;
+       }
+
+       kernfs_activate(kn);
+       return 0;
+
+out_destroy:
+       /*
+        * NOTE(review): this also removes a directory that already existed
+        * before this call (the SNC case above) - confirm that is intended.
+        */
+       kernfs_remove(kn);
+       return ret;
+}
+
+/*
+ * Add the mon_data subdirectory of domain @d to every "ctrl_mon" group on
+ * rdt_all_groups and to every "monitor" group on their child lists.
+ * The result of the individual mkdir_mondata_subdir() calls is not checked.
+ */
+static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+                                          struct rdt_mon_domain *d)
+{
+       struct rdtgroup *ctrl_grp, *mon_grp;
+
+       list_for_each_entry(ctrl_grp, &rdt_all_groups, rdtgroup_list) {
+               mkdir_mondata_subdir(ctrl_grp->mon.mon_data_kn, d, r, ctrl_grp);
+
+               list_for_each_entry(mon_grp, &ctrl_grp->mon.crdtgrp_list, mon.crdtgrp_list)
+                       mkdir_mondata_subdir(mon_grp->mon.mon_data_kn, d, r, mon_grp);
+       }
+}
+
+/*
+ * Create the mon_data subdirectory of every monitor domain of @r below
+ * @parent_kn.
+ *
+ * Return: 0 when all domains were handled, otherwise the first error from
+ * mkdir_mondata_subdir(); later domains are then not attempted.
+ */
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+                                      struct rdt_resource *r,
+                                      struct rdtgroup *prgrp)
+{
+       struct rdt_mon_domain *d;
+       int err = 0;
+
+       /* The domain list is walked below; it must not race with cpuhp. */
+       lockdep_assert_cpus_held();
+
+       list_for_each_entry(d, &r->mon_domains, hdr.list) {
+               err = mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+/*
+ * Create the "mon_data" directory, which holds the monitored data.
+ *
+ * mon_data contains one directory per domain, named
+ * mon_<domain_name>_<domain_id>. For example, a mon_data with L3 domains
+ * looks like this:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ * When @dest_kn is not NULL it receives the mon_data node as soon as the
+ * directory exists; it is not reset if a later step fails and the
+ * directory is removed again.
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+                            struct rdtgroup *prgrp,
+                            struct kernfs_node **dest_kn)
+{
+       struct kernfs_node *mon_data_kn;
+       struct rdt_resource *r;
+       int ret;
+
+       /* The mon_data directory itself comes first. */
+       ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &mon_data_kn);
+       if (ret)
+               return ret;
+
+       if (dest_kn)
+               *dest_kn = mon_data_kn;
+
+       /*
+        * Then one subdirectory per domain. Note that all events in a domain
+        * like L3 are grouped into a resource whose domain is L3.
+        */
+       for_each_mon_capable_rdt_resource(r) {
+               ret = mkdir_mondata_subdir_alldom(mon_data_kn, r, prgrp);
+               if (ret) {
+                       kernfs_remove(mon_data_kn);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * cbm_ensure_valid - Enforce validity on provided CBM
+ * @_val:      Candidate CBM
+ * @r:         RDT resource to which the CBM belongs
+ *
+ * The provided CBM represents all cache portions available for use. This
+ * may be represented by a bitmap that does not consist of contiguous ones
+ * and thus be an invalid CBM.
+ * Here the provided CBM is forced to be a valid CBM by keeping only the
+ * first run of contiguous set bits and clearing every bit above that run
+ * (within the resource's CBM length).
+ * The intention here is to provide a valid default CBM with which a new
+ * resource group is initialized. The user can follow this with a
+ * modification to the CBM if the default does not satisfy the
+ * requirements.
+ *
+ * Return: The adjusted CBM, or 0 when @_val is 0.
+ */
+static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
+{
+       unsigned int cbm_len = r->cache.cbm_len;
+       unsigned long run_start, run_end;
+       unsigned long bits = _val;
+
+       if (bits == 0)
+               return 0;
+
+       /* [run_start, run_end) is the lowest run of contiguous ones. */
+       run_start = find_first_bit(&bits, cbm_len);
+       run_end = find_next_zero_bit(&bits, cbm_len, run_start);
+
+       /* Drop everything from the end of that run up to the CBM length. */
+       bitmap_clear(&bits, run_end, cbm_len - run_end);
+
+       return (u32)bits;
+}
+
+/*
+ * Initialize cache resources per RDT domain
+ *
+ * Stage a default CBM for @closid in domain @d: start from the shareable
+ * bits, add what the shareable groups of the other allocated CLOSIDs use,
+ * and add all bits that nobody uses. All-zero CBM is invalid.
+ *
+ * Return: 0 with the staged configuration marked as new, or -ENOSPC when
+ * the resulting CBM has fewer than the resource's min_cbm_bits bits set.
+ */
+static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
+                                u32 closid)
+{
+       enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
+       struct resctrl_staged_config *cfg = &d->staged_config[s->conf_type];
+       struct rdt_resource *r = s->res;
+       u32 peer_ctl, ctrl_val;
+       unsigned long tmp_cbm;
+       enum rdtgrp_mode mode;
+       u32 used_b, unused_b;
+       int i;
+
+       cfg->have_new_ctrl = false;
+       cfg->new_ctrl = r->cache.shareable_bits;
+       used_b = r->cache.shareable_bits;
+
+       for (i = 0; i < closids_supported(); i++) {
+               /* Only other, currently allocated CLOSIDs are of interest. */
+               if (!closid_allocated(i) || i == closid)
+                       continue;
+
+               mode = rdtgroup_mode_by_closid(i);
+               /*
+                * ctrl values for locksetup aren't relevant until the
+                * schemata is written, and the mode becomes
+                * RDT_MODE_PSEUDO_LOCKED.
+                */
+               if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+                       continue;
+
+               /*
+                * If CDP is active include peer domain's usage to ensure
+                * there is no overlap with an exclusive group.
+                */
+               peer_ctl = 0;
+               if (resctrl_arch_get_cdp_enabled(r->rid))
+                       peer_ctl = resctrl_arch_get_config(r, d, i, peer_type);
+               ctrl_val = resctrl_arch_get_config(r, d, i, s->conf_type);
+
+               used_b |= ctrl_val | peer_ctl;
+               if (mode == RDT_MODE_SHAREABLE)
+                       cfg->new_ctrl |= ctrl_val | peer_ctl;
+       }
+
+       /* A pseudo-locked region in this domain counts as used as well. */
+       if (d->plr && d->plr->cbm > 0)
+               used_b |= d->plr->cbm;
+
+       /* Everything inside the CBM length that is not in use. */
+       unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
+       unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
+       cfg->new_ctrl |= unused_b;
+
+       /*
+        * Force the initial CBM to be valid, user can
+        * modify the CBM based on system availability.
+        */
+       cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
+
+       /*
+        * Assign the u32 CBM to an unsigned long to ensure that
+        * bitmap_weight() does not access out-of-bound memory.
+        */
+       tmp_cbm = cfg->new_ctrl;
+       if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
+               rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
+               return -ENOSPC;
+       }
+
+       cfg->have_new_ctrl = true;
+       return 0;
+}
+
+/*
+ * Initialize cache resources with default values.
+ *
+ * A new RDT group is being created on an allocation capable (CAT)
+ * supporting system. Stage a default CBM for @closid in every control
+ * domain of the schema's resource so the group starts off with all usable
+ * allocations.
+ *
+ * If any domain fails (for example because no shareable bits are left)
+ * the walk stops and the entire allocation fails with that error.
+ */
+static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
+{
+       struct rdt_ctrl_domain *dom;
+       int err;
+
+       list_for_each_entry(dom, &s->res->ctrl_domains, hdr.list) {
+               err = __init_one_rdt_domain(dom, s, closid);
+               if (err < 0)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Initialize MBA resource with default values.
+ *
+ * For every control domain of @r: when is_mba_sc() reports true, only the
+ * domain's mbps_val[] entry of @closid is set to MBA_MAX_MBPS; otherwise
+ * the resource's default control value is staged in the CDP_NONE
+ * configuration.
+ */
+static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
+{
+       struct resctrl_staged_config *cfg;
+       struct rdt_ctrl_domain *dom;
+
+       list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+               if (!is_mba_sc(r)) {
+                       cfg = &dom->staged_config[CDP_NONE];
+                       cfg->new_ctrl = resctrl_get_default_ctrl(r);
+                       cfg->have_new_ctrl = true;
+               } else {
+                       dom->mbps_val[closid] = MBA_MAX_MBPS;
+               }
+       }
+}
+
+/*
+ * Initialize the RDT group's allocations.
+ *
+ * Walks every schema: MBA and SMBA resources get their default values,
+ * all other resources a default CBM. The staged values are then applied
+ * with resctrl_arch_update_domains(), except for MBA resources for which
+ * is_mba_sc() is true. On success the group's mode is set to
+ * RDT_MODE_SHAREABLE. The staged configurations are cleared on entry and
+ * again on exit.
+ */
+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
+{
+       struct resctrl_schema *s;
+       struct rdt_resource *r;
+       bool is_mba;
+       int ret = 0;
+
+       rdt_staged_configs_clear();
+
+       list_for_each_entry(s, &resctrl_schema_all, list) {
+               r = s->res;
+               is_mba = r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA;
+
+               if (!is_mba) {
+                       ret = rdtgroup_init_cat(s, rdtgrp->closid);
+                       if (ret < 0)
+                               goto out;
+               } else {
+                       rdtgroup_init_mba(r, rdtgrp->closid);
+                       /* Nothing was staged in this case, so nothing to apply. */
+                       if (is_mba_sc(r))
+                               continue;
+               }
+
+               ret = resctrl_arch_update_domains(r, rdtgrp->closid);
+               if (ret < 0) {
+                       rdt_last_cmd_puts("Failed to initialize allocations\n");
+                       goto out;
+               }
+       }
+
+       rdtgrp->mode = RDT_MODE_SHAREABLE;
+
+out:
+       rdt_staged_configs_clear();
+       return ret;
+}
+
+/*
+ * Allocate an RMID for @rdtgrp and create its mon_data directory.
+ *
+ * Does nothing and succeeds when monitoring is not supported. The RMID is
+ * freed again if the directory cannot be created.
+ *
+ * Return: 0 on success, otherwise the error from alloc_rmid() or
+ * mkdir_mondata_all().
+ */
+static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
+{
+       int ret;
+
+       if (!resctrl_arch_mon_capable())
+               return 0;
+
+       ret = alloc_rmid(rdtgrp->closid);
+       if (ret < 0) {
+               rdt_last_cmd_puts("Out of RMIDs\n");
+               return ret;
+       }
+       rdtgrp->mon.rmid = ret;
+
+       ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+       if (!ret)
+               return 0;
+
+       rdt_last_cmd_puts("kernfs subdir error\n");
+       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+       return ret;
+}
+
+/* Free the RMID of @rgrp; a no-op when monitoring is not supported. */
+static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
+{
+       if (!resctrl_arch_mon_capable())
+               return;
+
+       free_rmid(rgrp->closid, rgrp->mon.rmid);
+}
+
+/*
+ * Mon groups may only be created within a directory called "mon_groups",
+ * which is present in every ctrl_mon group. Check that @kn is such a
+ * directory and that @name is acceptable for a mon group inside it.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ *   This makes sure "mon_groups" directory always has a ctrl_mon group
+ *   as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+       if (strcmp(rdt_kn_name(kn), "mon_groups") != 0)
+               return false;
+
+       return strcmp(name, "mon_groups") != 0;
+}
+
+/*
+ * Common first stage of creating a resource group directory of type
+ * @rtype called @name below @parent_kn: lock the parent group, allocate
+ * the rdtgroup, create its kernfs directory and populate it with the
+ * files matching the group type.
+ *
+ * Return: 0 on success, with the new group stored in *@r and @parent_kn
+ * still locked (the caller unlocks it). On failure a negative error code
+ * is returned after @parent_kn has been unlocked; *@r may already have
+ * been written by then but the group it pointed to has been freed.
+ */
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+                            const char *name, umode_t mode,
+                            enum rdt_group_type rtype, struct rdtgroup **r)
+{
+       struct rdtgroup *parent_grp, *new_grp;
+       struct kernfs_node *dir_kn;
+       unsigned long rft_flags;
+       int ret;
+
+       parent_grp = rdtgroup_kn_lock_live(parent_kn);
+       if (!parent_grp) {
+               ret = -ENODEV;
+               goto out_unlock;
+       }
+
+       if (rtype == RDTMON_GROUP) {
+               /* A monitor group must be created in a "mon_groups" directory. */
+               if (!is_mon_groups(parent_kn, name)) {
+                       ret = -EPERM;
+                       goto out_unlock;
+               }
+
+               if (parent_grp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+                   parent_grp->mode == RDT_MODE_PSEUDO_LOCKED) {
+                       ret = -EINVAL;
+                       rdt_last_cmd_puts("Pseudo-locking in progress\n");
+                       goto out_unlock;
+               }
+       }
+
+       /* allocate the rdtgroup. */
+       new_grp = kzalloc(sizeof(*new_grp), GFP_KERNEL);
+       if (!new_grp) {
+               ret = -ENOSPC;
+               rdt_last_cmd_puts("Kernel out of memory\n");
+               goto out_unlock;
+       }
+       *r = new_grp;
+       new_grp->mon.parent = parent_grp;
+       new_grp->type = rtype;
+       INIT_LIST_HEAD(&new_grp->mon.crdtgrp_list);
+
+       /* kernfs creates the directory for the new group */
+       dir_kn = kernfs_create_dir(parent_kn, name, mode, new_grp);
+       if (IS_ERR(dir_kn)) {
+               ret = PTR_ERR(dir_kn);
+               rdt_last_cmd_puts("kernfs create error\n");
+               goto out_free_rgrp;
+       }
+       new_grp->kn = dir_kn;
+
+       /*
+        * kernfs_remove() will drop the reference count on the node which
+        * will free it. But it still needs to stick around for the
+        * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+        * which will be dropped by kernfs_put() in rdtgroup_remove().
+        */
+       kernfs_get(dir_kn);
+
+       ret = rdtgroup_kn_set_ugid(dir_kn);
+       if (ret) {
+               rdt_last_cmd_puts("kernfs perm error\n");
+               goto out_destroy;
+       }
+
+       /*
+        * Every group gets the base files. Control groups add the control
+        * files plus, when monitoring is supported, the monitor files;
+        * monitor groups always add the monitor files.
+        */
+       rft_flags = RFTYPE_BASE;
+       if (rtype != RDTCTRL_GROUP) {
+               rft_flags |= RFTYPE_MON;
+       } else {
+               rft_flags |= RFTYPE_CTRL;
+               if (resctrl_arch_mon_capable())
+                       rft_flags |= RFTYPE_MON;
+       }
+
+       ret = rdtgroup_add_files(dir_kn, rft_flags);
+       if (ret) {
+               rdt_last_cmd_puts("kernfs fill error\n");
+               goto out_destroy;
+       }
+
+       /*
+        * The caller unlocks the parent_kn upon success.
+        */
+       return 0;
+
+out_destroy:
+       /* Drop the extra reference taken above, then remove the directory. */
+       kernfs_put(new_grp->kn);
+       kernfs_remove(new_grp->kn);
+out_free_rgrp:
+       kfree(new_grp);
+out_unlock:
+       rdtgroup_kn_unlock(parent_kn);
+       return ret;
+}
+
+/*
+ * Undo a successful mkdir_rdt_prepare(): remove the group's directory and
+ * then hand the group to rdtgroup_remove().
+ */
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+       struct kernfs_node *dir_kn = rgrp->kn;
+
+       kernfs_remove(dir_kn);
+       rdtgroup_remove(rgrp);
+}
+
+/*
+ * Create a monitor group under the "mon_groups" directory of a control
+ * and monitor group (ctrl_mon). This is a resource group to monitor a
+ * subset of tasks and cpus in its parent ctrl_mon group; it uses the
+ * parent's CLOSID.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+                             const char *name, umode_t mode)
+{
+       struct rdtgroup *mon_grp, *ctrl_grp;
+       int ret;
+
+       /* On success the parent stays locked until the unlock below. */
+       ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &mon_grp);
+       if (ret)
+               return ret;
+
+       ctrl_grp = mon_grp->mon.parent;
+       mon_grp->closid = ctrl_grp->closid;
+
+       ret = mkdir_rdt_prepare_rmid_alloc(mon_grp);
+       if (!ret) {
+               kernfs_activate(mon_grp->kn);
+
+               /*
+                * Add the new group to the list of monitor groups the
+                * parent ctrl_mon group has to track.
+                */
+               list_add_tail(&mon_grp->mon.crdtgrp_list, &ctrl_grp->mon.crdtgrp_list);
+       } else {
+               mkdir_rdt_prepare_clean(mon_grp);
+       }
+
+       rdtgroup_kn_unlock(parent_kn);
+       return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ *
+ * A CLOSID and, when monitoring is supported, an RMID are allocated, the
+ * default allocations are applied and the group is added to
+ * rdt_all_groups. Every failure undoes the steps completed so far.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+                                  const char *name, umode_t mode)
+{
+       struct rdtgroup *rdtgrp;
+       u32 closid;
+       int ret;
+
+       /* On success the parent stays locked until out_unlock. */
+       ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
+       if (ret)
+               return ret;
+
+       ret = closid_alloc();
+       if (ret < 0) {
+               rdt_last_cmd_puts("Out of CLOSIDs\n");
+               goto out_common_fail;
+       }
+       closid = ret;
+       rdtgrp->closid = closid;
+
+       ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+       if (ret)
+               goto out_closid_free;
+
+       kernfs_activate(rdtgrp->kn);
+
+       ret = rdtgroup_init_alloc(rdtgrp);
+       if (ret < 0)
+               goto out_rmid_free;
+
+       list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+       if (!resctrl_arch_mon_capable())
+               goto out_unlock;
+
+       /*
+        * Create an empty mon_groups directory to hold the subset
+        * of tasks and cpus to monitor.
+        */
+       ret = mongroup_create_dir(rdtgrp->kn, rdtgrp, "mon_groups", NULL);
+       if (ret) {
+               rdt_last_cmd_puts("kernfs subdir error\n");
+               goto out_del_list;
+       }
+       if (is_mba_sc(NULL))
+               rdtgrp->mba_mbps_event = mba_mbps_default_event;
+
+       goto out_unlock;
+
+out_del_list:
+       list_del(&rdtgrp->rdtgroup_list);
+out_rmid_free:
+       mkdir_rdt_prepare_rmid_free(rdtgrp);
+out_closid_free:
+       closid_free(closid);
+out_common_fail:
+       mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+       rdtgroup_kn_unlock(parent_kn);
+       return ret;
+}
+
+/*
+ * mkdir handler: create a control and monitoring group when the parent is
+ * the root directory and allocation is supported, otherwise try to create
+ * a monitoring group.
+ *
+ * Return: -EINVAL for a name containing '\n', -EPERM when neither kind of
+ * group can be created, otherwise the result of the group creation.
+ */
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+                         umode_t mode)
+{
+       /* Do not accept '\n' to avoid unparsable situation. */
+       if (strchr(name, '\n') != NULL)
+               return -EINVAL;
+
+       /*
+        * If the parent directory is the root directory and RDT
+        * allocation is supported, add a control and monitoring
+        * subdirectory
+        */
+       if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
+               return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
+
+       /* Without monitoring support there is nothing else to create. */
+       if (!resctrl_arch_mon_capable())
+               return -EPERM;
+
+       /* Else, attempt to add a monitoring subdirectory. */
+       return rdtgroup_mkdir_mon(parent_kn, name, mode);
+}
+
+/*
+ * Remove monitor group @rdtgrp: give its tasks and CPUs back to the parent
+ * ctrl_mon group, free its RMID, unlink it from the parent's list and
+ * remove its directory.
+ *
+ * @tmpmask is handed to rdt_move_group_tasks() and then extended with the
+ * group's own CPUs before being passed to update_closid_rmid().
+ *
+ * Always returns 0.
+ */
+static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
+{
+       struct rdtgroup *parent = rdtgrp->mon.parent;
+       u32 parent_closid, parent_rmid;
+       int cpu;
+
+       /* Give any tasks back to the parent group */
+       rdt_move_group_tasks(rdtgrp, parent, tmpmask);
+
+       /*
+        * Update per cpu closid/rmid of the moved CPUs first.
+        * Note: the closid will not change, but the arch code still needs it.
+        */
+       parent_closid = parent->closid;
+       parent_rmid = parent->mon.rmid;
+       for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+               resctrl_arch_set_cpu_default_closid_rmid(cpu, parent_closid, parent_rmid);
+       }
+
+       /*
+        * Update the MSR on moved CPUs and CPUs which have moved
+        * task running on them.
+        */
+       cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+       update_closid_rmid(tmpmask, NULL);
+
+       rdtgrp->flags = RDT_DELETED;
+       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+       /* Remove the group from the parent ctrl_mon group's list. */
+       WARN_ON(list_empty(&parent->mon.crdtgrp_list));
+       list_del(&rdtgrp->mon.crdtgrp_list);
+
+       kernfs_remove(rdtgrp->kn);
+
+       return 0;
+}
+
+/*
+ * Mark control group @rdtgrp as deleted, take it off the list of groups
+ * and remove its directory. Always returns 0.
+ */
+static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
+{
+       rdtgrp->flags = RDT_DELETED;
+       list_del(&rdtgrp->rdtgroup_list);
+
+       kernfs_remove(rdtgrp->kn);
+
+       return 0;
+}
+
+/*
+ * Remove control group @rdtgrp: give its tasks and CPUs back to the
+ * default group, free its RMID and CLOSID, remove the group itself and
+ * finally free its child monitor groups.
+ *
+ * @tmpmask is handed to rdt_move_group_tasks() and then extended with the
+ * group's own CPUs before being passed to update_closid_rmid().
+ *
+ * Always returns 0.
+ */
+static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
+{
+       u32 def_closid, def_rmid;
+       int cpu;
+
+       /* Give any tasks back to the default group */
+       rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+       /* Give any CPUs back to the default group */
+       cpumask_or(&rdtgroup_default.cpu_mask,
+                  &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+       /* Update per cpu closid and rmid of the moved CPUs first */
+       def_closid = rdtgroup_default.closid;
+       def_rmid = rdtgroup_default.mon.rmid;
+       for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+               resctrl_arch_set_cpu_default_closid_rmid(cpu, def_closid, def_rmid);
+       }
+
+       /*
+        * Update the MSR on moved CPUs and CPUs which have moved
+        * task running on them.
+        */
+       cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+       update_closid_rmid(tmpmask, NULL);
+
+       free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+       closid_free(rdtgrp->closid);
+
+       rdtgroup_ctrl_remove(rdtgrp);
+
+       /* Free all the child monitor group rmids. */
+       free_all_child_rdtgrp(rdtgrp);
+
+       return 0;
+}
+
+/*
+ * Return the parent of @kn.
+ *
+ * The result is valid within the RCU section it was obtained in, or while
+ * rdtgroup_mutex is held.
+ */
+static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
+{
+       struct kernfs_node *parent;
+
+       parent = rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
+
+       return parent;
+}
+
+/*
+ * rmdir handler: remove the resource group whose directory is @kn.
+ *
+ * Return: 0 on success, -ENOMEM when the temporary cpumask cannot be
+ * allocated, -EPERM when @kn has no live group or the group is not one
+ * that may be removed.
+ */
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+       struct kernfs_node *parent_kn;
+       struct rdtgroup *rdtgrp;
+       cpumask_var_t tmpmask;
+       bool pseudo_locking;
+       int ret;
+
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
+
+       rdtgrp = rdtgroup_kn_lock_live(kn);
+       if (!rdtgrp) {
+               ret = -EPERM;
+               goto out;
+       }
+       parent_kn = rdt_kn_parent(kn);
+
+       /*
+        * If the rdtgroup is a ctrl_mon group and parent directory
+        * is the root directory, remove the ctrl_mon group.
+        *
+        * If the rdtgroup is a mon group and parent directory
+        * is a valid "mon_groups" directory, remove the mon group.
+        */
+       if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+           rdtgrp != &rdtgroup_default) {
+               pseudo_locking = rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+                                rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED;
+               if (pseudo_locking)
+                       ret = rdtgroup_ctrl_remove(rdtgrp);
+               else
+                       ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
+       } else if (rdtgrp->type == RDTMON_GROUP &&
+                  is_mon_groups(parent_kn, rdt_kn_name(kn))) {
+               ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
+       } else {
+               ret = -EPERM;
+       }
+
+out:
+       rdtgroup_kn_unlock(kn);
+       free_cpumask_var(tmpmask);
+       return ret;
+}
+
+/**
+ * mongrp_reparent() - replace parent CTRL_MON group of a MON group
+ * @rdtgrp:            the MON group whose parent should be replaced
+ * @new_prdtgrp:       replacement parent CTRL_MON group for @rdtgrp
+ * @cpus:              cpumask provided by the caller for use during this call
+ *
+ * Moves @rdtgrp onto the child list of @new_prdtgrp and gives it the new
+ * parent's CLOSID, resulting in all member tasks' CLOSID immediately
+ * changing to that of the new parent group. Nothing is done when the
+ * parent does not change.
+ * Monitoring data for the group is unaffected by this operation.
+ */
+static void mongrp_reparent(struct rdtgroup *rdtgrp,
+                           struct rdtgroup *new_prdtgrp,
+                           cpumask_var_t cpus)
+{
+       struct rdtgroup *old_prdtgrp = rdtgrp->mon.parent;
+
+       WARN_ON(rdtgrp->type != RDTMON_GROUP);
+       WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
+
+       /* Nothing to do when simply renaming a MON group. */
+       if (old_prdtgrp == new_prdtgrp)
+               return;
+
+       WARN_ON(list_empty(&old_prdtgrp->mon.crdtgrp_list));
+       list_move_tail(&rdtgrp->mon.crdtgrp_list, &new_prdtgrp->mon.crdtgrp_list);
+
+       rdtgrp->closid = new_prdtgrp->closid;
+       rdtgrp->mon.parent = new_prdtgrp;
+
+       /* Propagate updated closid to all tasks in this group. */
+       rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
+
+       update_closid_rmid(cpus, NULL);
+}
+
+/*
+ * rename handler: move MON group directory @kn to @new_name below
+ * @new_parent, which must be a "mon_groups" directory. On success the
+ * group is re-parented to the CTRL_MON group owning @new_parent.
+ *
+ * Return: 0 on success; -ENOENT when either node has no group or a group
+ * is being deleted; -EPERM when source or destination is not acceptable;
+ * -ENOMEM when the temporary cpumask cannot be allocated; otherwise the
+ * error from kernfs_rename().
+ */
+static int rdtgroup_rename(struct kernfs_node *kn,
+                          struct kernfs_node *new_parent, const char *new_name)
+{
+       struct kernfs_node *kn_parent;
+       struct rdtgroup *new_prdtgrp;
+       struct rdtgroup *rdtgrp;
+       cpumask_var_t tmpmask;
+       int ret;
+
+       rdtgrp = kernfs_to_rdtgroup(kn);
+       new_prdtgrp = kernfs_to_rdtgroup(new_parent);
+       if (!rdtgrp || !new_prdtgrp)
+               return -ENOENT;
+
+       /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
+       rdtgroup_kn_get(rdtgrp, kn);
+       rdtgroup_kn_get(new_prdtgrp, new_parent);
+
+       mutex_lock(&rdtgroup_mutex);
+
+       rdt_last_cmd_clear();
+
+       /*
+        * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
+        * either kernfs_node is a file.
+        */
+       if (kernfs_type(kn) != KERNFS_DIR ||
+           kernfs_type(new_parent) != KERNFS_DIR) {
+               /* Newline-terminated like every other status message here. */
+               rdt_last_cmd_puts("Source and destination must be directories\n");
+               ret = -EPERM;
+               goto out;
+       }
+
+       if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       kn_parent = rdt_kn_parent(kn);
+       if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
+           !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
+               rdt_last_cmd_puts("Source must be a MON group\n");
+               ret = -EPERM;
+               goto out;
+       }
+
+       if (!is_mon_groups(new_parent, new_name)) {
+               rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
+               ret = -EPERM;
+               goto out;
+       }
+
+       /*
+        * If the MON group is monitoring CPUs, the CPUs must be assigned to the
+        * current parent CTRL_MON group and therefore cannot be assigned to
+        * the new parent, making the move illegal.
+        */
+       if (!cpumask_empty(&rdtgrp->cpu_mask) &&
+           rdtgrp->mon.parent != new_prdtgrp) {
+               rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
+               ret = -EPERM;
+               goto out;
+       }
+
+       /*
+        * Allocate the cpumask for use in mongrp_reparent() to avoid the
+        * possibility of failing to allocate it after kernfs_rename() has
+        * succeeded.
+        */
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * Perform all input validation and allocations needed to ensure
+        * mongrp_reparent() will succeed before calling kernfs_rename(),
+        * otherwise it would be necessary to revert this call if
+        * mongrp_reparent() failed.
+        */
+       ret = kernfs_rename(kn, new_parent, new_name);
+       if (!ret)
+               mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
+
+       free_cpumask_var(tmpmask);
+
+out:
+       mutex_unlock(&rdtgroup_mutex);
+       rdtgroup_kn_put(rdtgrp, kn);
+       rdtgroup_kn_put(new_prdtgrp, new_parent);
+       return ret;
+}
+
+/*
+ * Write the currently active resctrl options to @seq, each one prefixed
+ * with a comma. Always returns 0.
+ */
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+       struct rdt_resource *mba;
+
+       /* Code and data prioritization, per cache level. */
+       if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
+               seq_puts(seq, ",cdp");
+       if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
+               seq_puts(seq, ",cdpl2");
+
+       mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+       if (is_mba_sc(mba))
+               seq_puts(seq, ",mba_MBps");
+
+       if (resctrl_debug)
+               seq_puts(seq, ",debug");
+
+       return 0;
+}
+
+/*
+ * kernfs syscall callbacks for the resctrl hierarchy. Passed to
+ * kernfs_create_root() by rdtgroup_setup_root().
+ */
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+       .mkdir          = rdtgroup_mkdir,
+       .rmdir          = rdtgroup_rmdir,
+       .rename         = rdtgroup_rename,
+       .show_options   = rdtgroup_show_options,
+};
+
+/*
+ * Create the kernfs root for resctrl with the default group as its private
+ * data, record it in @ctx and point the default group at the root node.
+ *
+ * Returns 0 on success or the error encoded by kernfs_create_root().
+ */
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
+{
+       unsigned int root_flags = KERNFS_ROOT_CREATE_DEACTIVATED |
+                                 KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK;
+
+       rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, root_flags,
+                                     &rdtgroup_default);
+       if (IS_ERR(rdt_root))
+               return PTR_ERR(rdt_root);
+
+       rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
+       ctx->kfc.root = rdt_root;
+
+       return 0;
+}
+
+/*
+ * Destroy the kernfs root set up by rdtgroup_setup_root() and clear the
+ * default group's kernfs node pointer. The caller must hold rdtgroup_mutex.
+ */
+static void rdtgroup_destroy_root(void)
+{
+       lockdep_assert_held(&rdtgroup_mutex);
+
+       kernfs_destroy_root(rdt_root);
+       /* The root node is gone; do not leave a stale pointer behind. */
+       rdtgroup_default.kn = NULL;
+}
+
+/*
+ * Initialise the default resource group with the reserved CLOSID and RMID
+ * and add it to rdt_all_groups. Done under rdtgroup_mutex.
+ */
+static void rdtgroup_setup_default(void)
+{
+       struct rdtgroup *def = &rdtgroup_default;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       def->type = RDTCTRL_GROUP;
+       def->closid = RESCTRL_RESERVED_CLOSID;
+       def->mon.rmid = RESCTRL_RESERVED_RMID;
+
+       /* The default group starts with no child monitor groups. */
+       INIT_LIST_HEAD(&def->mon.crdtgrp_list);
+       list_add(&def->rdtgroup_list, &rdt_all_groups);
+
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * Free the per-domain monitoring state: the busy-RMID bitmap and the
+ * total/local MBM state arrays allocated by domain_setup_mon_state().
+ */
+static void domain_destroy_mon_state(struct rdt_mon_domain *d)
+{
+       bitmap_free(d->rmid_busy_llc);
+       kfree(d->mbm_total);
+       kfree(d->mbm_local);
+}
+
+/*
+ * Filesystem-side teardown for a control domain that is going offline.
+ * Only the MBA resource carries state here, and only when the MBA software
+ * controller is supported.
+ */
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+       bool destroy_mba_sc;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       destroy_mba_sc = supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA;
+       if (destroy_mba_sc)
+               mba_sc_domain_destroy(r, d);
+
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * Filesystem-side teardown for a monitor domain that is going offline.
+ *
+ * Under rdtgroup_mutex this removes the domain's monitor data directories
+ * (if resctrl is mounted), cancels the MBM overflow and limbo work items and
+ * finally frees the monitoring state of the domain.
+ */
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+       mutex_lock(&rdtgroup_mutex);
+
+       /*
+        * If resctrl is mounted, remove all the
+        * per domain monitor data directories.
+        */
+       if (resctrl_mounted && resctrl_arch_mon_capable())
+               rmdir_mondata_subdir_allrdtgrp(r, d);
+
+       /* Stop the MBM overflow work that was set up when the domain came online. */
+       if (resctrl_is_mbm_enabled())
+               cancel_delayed_work(&d->mbm_over);
+       if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+               /*
+                * When a package is going down, forcefully
+                * decrement rmid->ebusy. There is no way to know
+                * that the L3 was flushed and hence may lead to
+                * incorrect counts in rare scenarios, but leaving
+                * the RMID as busy creates RMID leaks if the
+                * package never comes back.
+                */
+               __check_limbo(d, true);
+               cancel_delayed_work(&d->cqm_limbo);
+       }
+
+       /* Release the state allocated by domain_setup_mon_state(). */
+       domain_destroy_mon_state(d);
+
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+/**
+ * domain_setup_mon_state() -  Initialise domain monitoring structures.
+ * @r: The resource for the newly online domain.
+ * @d: The newly online domain.
+ *
+ * Allocate monitor resources that belong to this domain.
+ * Called when the first CPU of a domain comes online, regardless of whether
+ * the filesystem is mounted.
+ * During boot this may be called before global allocations have been made by
+ * resctrl_mon_resource_init().
+ *
+ * On failure everything allocated here is freed again and the corresponding
+ * pointers in @d are reset to NULL so that no dangling pointer is left behind.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+       u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+
+       if (resctrl_arch_is_llc_occupancy_enabled()) {
+               d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
+               if (!d->rmid_busy_llc)
+                       return -ENOMEM;
+       }
+       if (resctrl_arch_is_mbm_total_enabled()) {
+               d->mbm_total = kcalloc(idx_limit, sizeof(*d->mbm_total),
+                                      GFP_KERNEL);
+               if (!d->mbm_total)
+                       goto err_free;
+       }
+       if (resctrl_arch_is_mbm_local_enabled()) {
+               d->mbm_local = kcalloc(idx_limit, sizeof(*d->mbm_local),
+                                      GFP_KERNEL);
+               if (!d->mbm_local)
+                       goto err_free;
+       }
+
+       return 0;
+
+err_free:
+       /*
+        * kfree() and bitmap_free() accept NULL, so a single unwind path
+        * covers every failure point above.
+        */
+       kfree(d->mbm_total);
+       d->mbm_total = NULL;
+       bitmap_free(d->rmid_busy_llc);
+       d->rmid_busy_llc = NULL;
+
+       return -ENOMEM;
+}
+
+/*
+ * Filesystem-side setup for a control domain that has come online.
+ *
+ * Returns 0, or the error from mba_sc_domain_allocate() when the MBA
+ * software controller state has to be allocated for this domain.
+ */
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+       int ret = 0;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       /* Nothing to allocate unless this is MBA with mba_MBps support. */
+       if (!supports_mba_mbps() || r->rid != RDT_RESOURCE_MBA)
+               goto unlock;
+
+       /* RDT_RESOURCE_MBA is never mon_capable */
+       ret = mba_sc_domain_allocate(r, d);
+
+unlock:
+       mutex_unlock(&rdtgroup_mutex);
+
+       return ret;
+}
+
+/*
+ * Filesystem-side setup for a monitor domain that has come online:
+ * allocate its monitoring state, initialise the overflow and limbo work
+ * items and, if resctrl is mounted, create its monitor data directories.
+ *
+ * Returns 0 on success or the error from domain_setup_mon_state().
+ */
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+       int ret;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       ret = domain_setup_mon_state(r, d);
+       if (ret)
+               goto unlock;
+
+       if (resctrl_is_mbm_enabled()) {
+               INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+               mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
+                                          RESCTRL_PICK_ANY_CPU);
+       }
+
+       if (resctrl_arch_is_llc_occupancy_enabled())
+               INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+
+       /*
+        * While the filesystem is not mounted only the default resource
+        * group exists, and its directories are created at mount time by
+        * rdt_get_tree() calling mkdir_mondata_all(). Only a mounted
+        * filesystem needs the per domain monitor data directories now.
+        */
+       if (resctrl_mounted && resctrl_arch_mon_capable())
+               mkdir_mondata_subdir_allrdtgrp(r, d);
+
+unlock:
+       mutex_unlock(&rdtgroup_mutex);
+
+       return ret;
+}
+
+/*
+ * Add a CPU that has come online to the default resource group's CPU mask.
+ * The mask is updated under rdtgroup_mutex.
+ */
+void resctrl_online_cpu(unsigned int cpu)
+{
+       mutex_lock(&rdtgroup_mutex);
+       /* The CPU is set in default rdtgroup after online. */
+       cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * Clear @cpu from the first child monitor group of @r that has it set.
+ * The walk stops as soon as one such group has been found.
+ */
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+       struct rdtgroup *child;
+
+       list_for_each_entry(child, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+               if (cpumask_test_and_clear_cpu(cpu, &child->cpu_mask))
+                       return;
+       }
+}
+
+/*
+ * Look up the monitor domain of @r whose CPU mask contains @cpu.
+ * Must be called with the CPU hotplug lock held.
+ *
+ * Returns the domain, or NULL if no monitor domain of @r contains @cpu.
+ */
+static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
+                                                     struct rdt_resource *r)
+{
+       struct rdt_mon_domain *found = NULL;
+       struct rdt_mon_domain *dom;
+
+       lockdep_assert_cpus_held();
+
+       list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+               if (cpumask_test_cpu(cpu, &dom->hdr.cpu_mask)) {
+                       found = dom;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+/*
+ * Filesystem-side handling of a CPU going offline: remove the CPU from the
+ * resource group (and child monitor group) that holds it, then move the MBM
+ * overflow and limbo work away from this CPU if it was the one running them.
+ */
+void resctrl_offline_cpu(unsigned int cpu)
+{
+       struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+       struct rdt_mon_domain *dom;
+       struct rdtgroup *grp;
+
+       mutex_lock(&rdtgroup_mutex);
+
+       list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
+               if (!cpumask_test_and_clear_cpu(cpu, &grp->cpu_mask))
+                       continue;
+
+               clear_childcpus(grp, cpu);
+               break;
+       }
+
+       if (!l3->mon_capable)
+               goto unlock;
+
+       dom = get_mon_domain_from_cpu(cpu, l3);
+       if (!dom)
+               goto unlock;
+
+       /* Restart the handlers with @cpu passed as the CPU to exclude. */
+       if (resctrl_is_mbm_enabled() && cpu == dom->mbm_work_cpu) {
+               cancel_delayed_work(&dom->mbm_over);
+               mbm_setup_overflow_handler(dom, 0, cpu);
+       }
+       if (resctrl_arch_is_llc_occupancy_enabled() &&
+           cpu == dom->cqm_work_cpu && has_busy_rmid(dom)) {
+               cancel_delayed_work(&dom->cqm_limbo);
+               cqm_setup_limbo_handler(dom, 0, cpu);
+       }
+
+unlock:
+       mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * resctrl_init - resctrl filesystem initialization
+ *
+ * Set up the default group and the monitoring resources, create the sysfs
+ * mount point, register the resctrl filesystem and create the resctrl
+ * debugfs directory. Failures after the monitoring resources have been
+ * initialised unwind in reverse order.
+ *
+ * Return: 0 on success or -errno
+ */
+int resctrl_init(void)
+{
+       int ret;
+
+       seq_buf_init(&last_cmd_status, last_cmd_status_buf,
+                    sizeof(last_cmd_status_buf));
+
+       rdtgroup_setup_default();
+
+       thread_throttle_mode_init();
+
+       ret = resctrl_mon_resource_init();
+       if (ret)
+               return ret;
+
+       ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+       if (ret)
+               goto cleanup_mon_resource;
+
+       ret = register_filesystem(&rdt_fs_type);
+       if (ret)
+               goto cleanup_mountpoint;
+
+       /*
+        * Adding the resctrl debugfs directory here may not be ideal since
+        * it would let the resctrl debugfs directory appear on the debugfs
+        * filesystem before the resctrl filesystem is mounted.
+        * It may also be ok since that would enable debugging of RDT before
+        * resctrl is mounted.
+        * The reason why the debugfs directory is created here and not in
+        * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
+        * during the debugfs directory creation also &sb->s_type->i_mutex_key
+        * (the lockdep class of inode->i_rwsem). Other filesystem
+        * interactions (eg. SyS_getdents) have the lock ordering:
+        * &sb->s_type->i_mutex_key --> &mm->mmap_lock
+        * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
+        * is taken, thus creating dependency:
+        * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
+        * issues considering the other two lock dependencies.
+        * By creating the debugfs directory here we avoid a dependency
+        * that may cause deadlock (even though file operations cannot
+        * occur until the filesystem is mounted, but I do not know how to
+        * tell lockdep that).
+        */
+       debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
+
+       return 0;
+
+cleanup_mountpoint:
+       sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_mon_resource:
+       resctrl_mon_resource_exit();
+
+       return ret;
+}
+
+/*
+ * Report whether any alloc capable resource still has a control domain, or
+ * any mon capable resource still has a monitor domain, on its list.
+ */
+static bool resctrl_online_domains_exist(void)
+{
+       struct rdt_resource *res;
+
+       /*
+        * Restrict the walk to capable resources so that
+        * resctrl_arch_get_resource() is free to return dummy
+        * 'not capable' resources.
+        */
+       for_each_alloc_capable_rdt_resource(res) {
+               if (list_empty(&res->ctrl_domains))
+                       continue;
+               return true;
+       }
+
+       for_each_mon_capable_rdt_resource(res) {
+               if (list_empty(&res->mon_domains))
+                       continue;
+               return true;
+       }
+
+       return false;
+}
+
+/**
+ * resctrl_exit() - Remove the resctrl filesystem and free resources.
+ *
+ * Called by the architecture code in response to a fatal error.
+ * Removes resctrl files and structures from kernfs to prevent further
+ * configuration.
+ *
+ * When called by the architecture code, all CPUs and resctrl domains must be
+ * offline. This ensures the limbo and overflow handlers are not scheduled to
+ * run, meaning the data structures they access can be freed by
+ * resctrl_mon_resource_exit().
+ *
+ * After resctrl_exit() returns, the architecture code should return an
+ * error from all resctrl_arch_ functions that can do this.
+ * resctrl_arch_get_resource() must continue to return struct rdt_resources
+ * with the correct rid field to ensure the filesystem can be unmounted.
+ */
+void resctrl_exit(void)
+{
+       cpus_read_lock();
+       /* Domains are expected to be offline already; warn once if any remain. */
+       WARN_ON_ONCE(resctrl_online_domains_exist());
+
+       /* Tear down the filesystem structures under rdtgroup_mutex. */
+       mutex_lock(&rdtgroup_mutex);
+       resctrl_fs_teardown();
+       mutex_unlock(&rdtgroup_mutex);
+
+       cpus_read_unlock();
+
+       /* Remove the debugfs directory created by resctrl_init(). */
+       debugfs_remove_recursive(debugfs_resctrl);
+       debugfs_resctrl = NULL;
+       unregister_filesystem(&rdt_fs_type);
+
+       /*
+        * Do not remove the sysfs mount point added by resctrl_init() so that
+        * it can be used to umount resctrl.
+        */
+
+       /* Counterpart of the resctrl_mon_resource_init() call in resctrl_init(). */
+       resctrl_mon_resource_exit();
+}
author	James Morse <james.morse@arm.com>
	Thu, 15 May 2025 16:58:54 +0000 (16:58 +0000)
committer	Borislav Petkov (AMD) <bp@alien8.de>
	Fri, 16 May 2025 12:36:09 +0000 (14:36 +0200)
Documentation/arch/x86/index.rst		patch \| blob \| blame \| history
Documentation/filesystems/index.rst		patch \| blob \| blame \| history
Documentation/filesystems/resctrl.rst	[moved from Documentation/arch/x86/resctrl.rst with 100% similarity]	patch \| blob \| blame \| history
MAINTAINERS		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/Makefile		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/ctrlmondata.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/internal.h		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/monitor.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/monitor_trace.h	[deleted file]	patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/pseudo_lock.c		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h		patch \| blob \| blame \| history
arch/x86/kernel/cpu/resctrl/rdtgroup.c		patch \| blob \| blame \| history
fs/resctrl/Kconfig		patch \| blob \| blame \| history
fs/resctrl/ctrlmondata.c		patch \| blob \| blame \| history
fs/resctrl/internal.h		patch \| blob \| blame \| history
fs/resctrl/monitor.c		patch \| blob \| blame \| history
fs/resctrl/monitor_trace.h		patch \| blob \| blame \| history
fs/resctrl/pseudo_lock.c		patch \| blob \| blame \| history
fs/resctrl/pseudo_lock_trace.h	[deleted file]	patch \| blob \| blame \| history
fs/resctrl/rdtgroup.c		patch \| blob \| blame \| history