Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 cache allocation interface from Thomas Gleixner: "This provides support for Intel's Cache Allocation Technology, a cache partitioning mechanism. The interface is odd, but the hardware interface of that CAT stuff is odd as well. We tried hard to come up with an abstraction, but that only allows rather simple partitioning, but no way of sharing and dealing with the per package nature of this mechanism. In the end we decided to expose the allocation bitmaps directly so all combinations of the hardware can be utilized. There are two ways of associating a cache partition: - Task A task can be added to a resource group. It uses the cache partition associated to the group. - CPU All tasks which are not member of a resource group use the group to which the CPU they are running on is associated with. That allows for simple CPU based partitioning schemes. The main expected user sare: - Virtualization so a VM can only trash only the associated part of the cash w/o disturbing others - Real-Time systems to seperate RT and general workloads. - Latency sensitive enterprise workloads - In theory this also can be used to protect against cache side channel attacks" [ Intel RDT is "Resource Director Technology". The interface really is rather odd and very specific, which delayed this pull request while I was thinking about it. The pull request itself came in early during the merge window, I just delayed it until things had calmed down and I had more time. But people tell me they'll use this, and the good news is that it is _so_ specific that it's rather independent of anything else, and no user is going to depend on the interface since it's pretty rare. So if push comes to shove, we can just remove the interface and nothing will break ] * 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (31 commits) x86/intel_rdt: Implement show_options() for resctrlfs x86/intel_rdt: Call intel_rdt_sched_in() with preemption disabled x86/intel_rdt: Update task closid immediately on CPU in rmdir and unmount x86/intel_rdt: Fix setting of closid when adding CPUs to a group x86/intel_rdt: Update percpu closid immeditately on CPUs affected by changee x86/intel_rdt: Reset per cpu closids on unmount x86/intel_rdt: Select KERNFS when enabling INTEL_RDT_A x86/intel_rdt: Prevent deadlock against hotplug lock x86/intel_rdt: Protect info directory from removal x86/intel_rdt: Add info files to Documentation x86/intel_rdt: Export the minimum number of set mask bits in sysfs x86/intel_rdt: Propagate error in rdt_mount() properly x86/intel_rdt: Add a missing #include MAINTAINERS: Add maintainer for Intel RDT resource allocation x86/intel_rdt: Add scheduler hook x86/intel_rdt: Add schemata file x86/intel_rdt: Add tasks files x86/intel_rdt: Add cpus file x86/intel_rdt: Add mkdir to resctrl file system x86/intel_rdt: Add "info" files to resctrl file system ...

Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cache allocation interface from Thomas Gleixner: "This provides support for Intel's Cache Allocation Technology, a cache partitioning mechanism. The interface is odd, but the hardware interface of that CAT stuff is odd as well. We tried hard to come up with an abstraction, but that only allows rather simple partitioning, but no way of sharing and dealing with the per package nature of this mechanism. In the end we decided to expose the allocation bitmaps directly so all combinations of the hardware can be utilized. There are two ways of associating a cache partition: - Task A task can be added to a resource group. It uses the cache partition associated to the group. - CPU All tasks which are not member of a resource group use the group to which the CPU they are running on is associated with. That allows for simple CPU based partitioning schemes. The main expected user sare: - Virtualization so a VM can only trash only the associated part of the cash w/o disturbing others - Real-Time systems to seperate RT and general workloads. - Latency sensitive enterprise workloads - In theory this also can be used to protect against cache side channel attacks" [ Intel RDT is "Resource Director Technology". The interface really is rather odd and very specific, which delayed this pull request while I was thinking about it. The pull request itself came in early during the merge window, I just delayed it until things had calmed down and I had more time. But people tell me they'll use this, and the good news is that it is _so_ specific that it's rather independent of anything else, and no user is going to depend on the interface since it's pretty rare. So if push comes to shove, we can just remove the interface and nothing will break ] * 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (31 commits) x86/intel_rdt: Implement show_options() for resctrlfs x86/intel_rdt: Call intel_rdt_sched_in() with preemption disabled x86/intel_rdt: Update task closid immediately on CPU in rmdir and unmount x86/intel_rdt: Fix setting of closid when adding CPUs to a group x86/intel_rdt: Update percpu closid immeditately on CPUs affected by changee x86/intel_rdt: Reset per cpu closids on unmount x86/intel_rdt: Select KERNFS when enabling INTEL_RDT_A x86/intel_rdt: Prevent deadlock against hotplug lock x86/intel_rdt: Protect info directory from removal x86/intel_rdt: Add info files to Documentation x86/intel_rdt: Export the minimum number of set mask bits in sysfs x86/intel_rdt: Propagate error in rdt_mount() properly x86/intel_rdt: Add a missing #include MAINTAINERS: Add maintainer for Intel RDT resource allocation x86/intel_rdt: Add scheduler hook x86/intel_rdt: Add schemata file x86/intel_rdt: Add tasks files x86/intel_rdt: Add cpus file x86/intel_rdt: Add mkdir to resctrl file system x86/intel_rdt: Add "info" files to resctrl file system ...
eb254f32 · Linus Torvalds · f79f7b1b · 76ae054c · eb254f32 · eb254f32
Commit eb254f32 authored Dec 22, 2016 by Linus Torvalds
20 changed files
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -272,6 +272,22 @@ Description:	Parameters for the CPU cache attributes
 				     the modified cache line is written to main
 				     memory only when it is replaced
+What:		/sys/devices/system/cpu/cpu*/cache/index*/id
+Date:		September 2016
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Cache id
+		The id provides a unique number for a specific instance of
+		a cache of a particular type. E.g. there may be a level
+		3 unified cache on each socket in a server and we may
+		assign them ids 0, 1, 2, ...
+		Note that id value can be non-contiguous. E.g. level 1
+		caches typically exist per core, but there may not be a
+		power of two cores on a socket, so these caches may be
+		numbered 0, 1, 2, 3, 4, 5, 8, 9, 10, ...
 What:		/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats
 		/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat
 		/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat

--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
+User Interface for Resource Allocation in Intel Resource Director Technology
+Copyright (C) 2016 Intel Corporation
+Fenghua Yu <fenghua.yu@intel.com>
+Tony Luck <tony.luck@intel.com>
+This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+To use the feature mount the file system:
+ # mount -t resctrl resctrl [-o cdp] /sys/fs/resctrl
+mount options are:
+"cdp": Enable code/data prioritization in L3 cache allocations.
+Info directory
+--------------
+The 'info' directory contains information about the enabled
+resources. Each resource has its own subdirectory. The subdirectory
+names reflect the resource names. Each subdirectory contains the
+following files:
+"num_closids":  The number of CLOSIDs which are valid for this
+	        resource. The kernel uses the smallest number of
+		CLOSIDs of all enabled resources as limit.
+"cbm_mask":     The bitmask which is valid for this resource. This
+		mask is equivalent to 100%.
+"min_cbm_bits": The minimum number of consecutive bits which must be
+		set when writing a mask.
+Resource groups
+---------------
+Resource groups are represented as directories in the resctrl file
+system. The default group is the root directory. Other groups may be
+created as desired by the system administrator using the "mkdir(1)"
+command, and removed using "rmdir(1)".
+There are three files associated with each group:
+"tasks": A list of tasks that belongs to this group. Tasks can be
+	added to a group by writing the task ID to the "tasks" file
+	(which will automatically remove them from the previous
+	group to which they belonged). New tasks created by fork(2)
+	and clone(2) are added to the same group as their parent.
+	If a pid is not in any sub partition, it is in root partition
+	(i.e. default partition).
+"cpus": A bitmask of logical CPUs assigned to this group. Writing
+	a new mask can add/remove CPUs from this group. Added CPUs
+	are removed from their previous group. Removed ones are
+	given to the default (root) group. You cannot remove CPUs
+	from the default group.
+"schemata": A list of all the resources available to this group.
+	Each resource has its own line and format - see below for
+	details.
+When a task is running the following rules define which resources
+are available to it:
+1) If the task is a member of a non-default group, then the schemata
+for that group is used.
+2) Else if the task belongs to the default group, but is running on a
+CPU that is assigned to some specific group, then the schemata for
+the CPU's group is used.
+3) Otherwise the schemata for the default group is used.
+Schemata files - general concepts
+---------------------------------
+Each line in the file describes one resource. The line starts with
+the name of the resource, followed by specific values to be applied
+in each of the instances of that resource on the system.
+Cache IDs
+---------
+On current generation systems there is one L3 cache per socket and L2
+caches are generally just shared by the hyperthreads on a core, but this
+isn't an architectural requirement. We could have multiple separate L3
+caches on a socket, multiple cores could share an L2 cache. So instead
+of using "socket" or "core" to define the set of logical cpus sharing
+a resource we use a "Cache ID". At a given cache level this will be a
+unique number across the whole system (but it isn't guaranteed to be a
+contiguous sequence, there may be gaps).  To find the ID for each logical
+CPU look in /sys/devices/system/cpu/cpu*/cache/index*/id
+Cache Bit Masks (CBM)
+---------------------
+For cache resources we describe the portion of the cache that is available
+for allocation using a bitmask. The maximum value of the mask is defined
+by each cpu model (and may be different for different cache levels). It
+is found using CPUID, but is also provided in the "info" directory of
+the resctrl file system in "info/{resource}/cbm_mask". X86 hardware
+requires that these masks have all the '1' bits in a contiguous block. So
+0x3, 0x6 and 0xC are legal 4-bit masks with two bits set, but 0x5, 0x9
+and 0xA are not.  On a system with a 20-bit mask each bit represents 5%
+of the capacity of the cache. You could partition the cache into four
+equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
+L3 details (code and data prioritization disabled)
+--------------------------------------------------
+With CDP disabled the L3 schemata format is:
+	L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
+L3 details (CDP enabled via mount option to resctrl)
+----------------------------------------------------
+When CDP is enabled L3 control is split into two separate resources
+so you can specify independent masks for code and data like this:
+	L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
+	L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
+L2 details
+----------
+L2 cache does not support code and data prioritization, so the
+schemata format is always:
+	L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
+Example 1
+---------
+On a two socket machine (one L3 cache per socket) with just four bits
+for cache bit masks
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+The default resource group is unmodified, so we have access to all parts
+of all caches (its schemata file reads "L3:0=f;1=f").
+Tasks that are under the control of group "p0" may only allocate from the
+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
+Example 2
+---------
+Again two sockets, but this time with a more realistic 20-bit mask.
+Two real time tasks pid=1234 running on processor 0 and pid=5678 running on
+processor 1 on socket 0 on a 2-socket and dual core machine. To avoid noisy
+neighbors, each of the two real-time tasks exclusively occupies one quarter
+of L3 cache on socket 0.
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+First we reset the schemata for the default group so that the "upper"
+50% of the L3 cache on socket 0 cannot be used by ordinary tasks:
+# echo "L3:0=3ff;1=fffff" > schemata
+Next we make a resource group for our first real time task and give
+it access to the "top" 25% of the cache on socket 0.
+# mkdir p0
+# echo "L3:0=f8000;1=fffff" > p0/schemata
+Finally we move our first real time task into this resource group. We
+also use taskset(1) to ensure the task always runs on a dedicated CPU
+on socket 0. Most uses of resource groups will also constrain which
+processors tasks run on.
+# echo 1234 > p0/tasks
+# taskset -cp 1 1234
+Ditto for the second real time task (with the remaining 25% of cache):
+# mkdir p1
+# echo "L3:0=7c00;1=fffff" > p1/schemata
+# echo 5678 > p1/tasks
+# taskset -cp 2 5678
+Example 3
+---------
+A single socket system which has real-time tasks running on core 4-7 and
+non real-time workload assigned to core 0-3. The real-time tasks share text
+and data, so a per task association is not required and due to interaction
+with the kernel it's desired that the kernel on these cores shares L3 with
+the tasks.
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+First we reset the schemata for the default group so that the "upper"
+50% of the L3 cache on socket 0 cannot be used by ordinary tasks:
+# echo "L3:0=3ff" > schemata
+Next we make a resource group for our real time cores and give
+it access to the "top" 50% of the cache on socket 0.
+# mkdir p0
+# echo "L3:0=ffc00;" > p0/schemata
+Finally we move core 4-7 over to the new group and make sure that the
+kernel and the tasks running there get 50% of the cache.
+# echo C0 > p0/cpus
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10327,6 +10327,14 @@ L:	linux-rdma@vger.kernel.org
 S:	Supported
 F:	drivers/infiniband/sw/rdmavt
+RDT - RESOURCE ALLOCATION
+M:	Fenghua Yu <fenghua.yu@intel.com>
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+F:	arch/x86/kernel/cpu/intel_rdt*
+F:	arch/x86/include/asm/intel_rdt*
+F:	Documentation/x86/intel_rdt*
 READ-COPY UPDATE (RCU)
 M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 M:	Josh Triplett <josh@joshtriplett.org>

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -412,6 +412,19 @@ config GOLDFISH
       def_bool y
       depends on X86_GOLDFISH
+config INTEL_RDT_A
+	bool "Intel Resource Director Technology Allocation support"
+	default n
+	depends on X86 && CPU_SUP_INTEL
+	select KERNFS
+	help
+	  Select to enable resource allocation which is a sub-feature of
+	  Intel Resource Director Technology(RDT). More information about
+	  RDT can be found in the Intel x86 Architecture Software
+	  Developer Manual.
+	  Say N if unsure.
 if X86_32
 config X86_EXTENDED_PLATFORM
 	bool "Support for extended (non-PC) x86 platforms"

--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -7,9 +7,9 @@
 #include <linux/perf_event.h>
 #include <linux/slab.h>
 #include <asm/cpu_device_id.h>
+#include <asm/intel_rdt_common.h>
 #include "../perf_event.h"
-#define MSR_IA32_PQR_ASSOC	0x0c8f
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
@@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 static bool cqm_enabled, mbm_enabled;
 unsigned int mbm_socket_max;
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:		The cached Resource Monitoring ID
- * @closid:		The cached Class Of Service ID
- * @rmid_usecnt:	The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-	u32			rmid;
-	u32			closid;
-	int			rmid_usecnt;
-};
 /*
 * The cached intel_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Both functions which modify the state
 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
 * interrupts disabled, which is sufficient for the protection.
 */
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 static struct hrtimer *mbm_timers;
 /**
 * struct sample - mbm event's (local or total) data

--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,9 @@
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2	( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3	( 7*32+ 6) /* Code and Data Prioritization L3 */
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
@@ -222,6 +225,7 @@
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
 #define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A	( 9*32+15) /* Resource Director Technology Allocation */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_AVX512DQ	( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */

--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+#ifdef CONFIG_INTEL_RDT_A
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+#include <asm/intel_rdt_common.h>
+#define IA32_L3_QOS_CFG		0xc81
+#define IA32_L3_CBM_BASE	0xc90
+#define IA32_L2_CBM_BASE	0xd10
+#define L3_QOS_CDP_ENABLE	0x01ULL
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn:				kernfs node
+ * @rdtgroup_list:		linked list for all rdtgroups
+ * @closid:			closid for this rdtgroup
+ * @cpu_mask:			CPUs assigned to this rdtgroup
+ * @flags:			status bits
+ * @waitcount:			how many cpus expect to find this
+ *				group when they acquire rdtgroup_mutex
+ */
+struct rdtgroup {
+	struct kernfs_node	*kn;
+	struct list_head	rdtgroup_list;
+	int			closid;
+	struct cpumask		cpu_mask;
+	int			flags;
+	atomic_t		waitcount;
+};
+/* rdtgroup.flags */
+#define	RDT_DELETED		1
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+int __init rdtgroup_init(void);
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: file name
+ * @mode: access mode
+ * @kf_ops: operations
+ * @seq_show: show content of the file
+ * @write: write to the file
+ */
+struct rftype {
+	char			*name;
+	umode_t			mode;
+	struct kernfs_ops	*kf_ops;
+	int (*seq_show)(struct kernfs_open_file *of,
+			struct seq_file *sf, void *v);
+	/*
+	 * write() is the generic write callback which maps directly to
+	 * kernfs write operation and overrides all other operations.
+	 * Maximum write size is determined by ->max_write_len.
+	 */
+	ssize_t (*write)(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off);
+};
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @enabled:			Is this feature enabled on this machine
+ * @capable:			Is this feature available on this machine
+ * @name:			Name to use in "schemata" file
+ * @num_closid:			Number of CLOSIDs available
+ * @max_cbm:			Largest Cache Bit Mask allowed
+ * @min_cbm_bits:		Minimum number of consecutive bits to be set
+ *				in a cache bit mask
+ * @domains:			All domains for this resource
+ * @num_domains:		Number of domains active
+ * @msr_base:			Base MSR address for CBMs
+ * @tmp_cbms:			Scratch space when updating schemata
+ * @num_tmp_cbms:		Number of CBMs in tmp_cbms
+ * @cache_level:		Which cache level defines scope of this domain
+ * @cbm_idx_multi:		Multiplier of CBM index
+ * @cbm_idx_offset:		Offset of CBM index. CBM index is computed by:
+ *				closid * cbm_idx_multi + cbm_idx_offset
+ */
+struct rdt_resource {
+	bool			enabled;
+	bool			capable;
+	char			*name;
+	int			num_closid;
+	int			cbm_len;
+	int			min_cbm_bits;
+	u32			max_cbm;
+	struct list_head	domains;
+	int			num_domains;
+	int			msr_base;
+	u32			*tmp_cbms;
+	int			num_tmp_cbms;
+	int			cache_level;
+	int			cbm_idx_multi;
+	int			cbm_idx_offset;
+};
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list:	all instances of this resource
+ * @id:		unique id for this instance
+ * @cpu_mask:	which cpus share this resource
+ * @cbm:	array of cache bit masks (indexed by CLOSID)
+ */
+struct rdt_domain {
+	struct list_head	list;
+	int			id;
+	struct cpumask		cpu_mask;
+	u32			*cbm;
+};
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res:       The resource to use
+ * @low:       Beginning index from base MSR
+ * @high:      End index
+ */
+struct msr_param {
+	struct rdt_resource	*res;
+	int			low;
+	int			high;
+};
+extern struct mutex rdtgroup_mutex;
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+int __init rdtgroup_init(void);
+enum {
+	RDT_RESOURCE_L3,
+	RDT_RESOURCE_L3DATA,
+	RDT_RESOURCE_L3CODE,
+	RDT_RESOURCE_L2,
+	/* Must be the last */
+	RDT_NUM_RESOURCES,
+};
+#define for_each_capable_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++) 							      \
+		if (r->capable)
+#define for_each_enabled_rdt_resource(r)				      \
+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+	     r++)							      \
+		if (r->enabled)
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+	struct {
+		unsigned int cbm_len:5;
+	} split;
+	unsigned int full;
+};
+/* CPUID.(EAX=10H, ECX=ResID=1).EDX */
+union cpuid_0x10_1_edx {
+	struct {
+		unsigned int cos_max:16;
+	} split;
+	unsigned int full;
+};
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+void rdt_cbm_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v);
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control and we enable by mounting the
+ *   resctrl file system.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ *   when a task with a different CLOSid is scheduled in.
+ *
+ * Must be called with preemption disabled.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+	if (static_branch_likely(&rdt_enable_key)) {
+		struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+		int closid;
+		/*
+		 * If this task has a closid assigned, use it.
+		 * Else use the closid assigned to this cpu.
+		 */
+		closid = current->closid;
+		if (closid == 0)
+			closid = this_cpu_read(cpu_closid);
+		if (closid != state->closid) {
+			state->closid = closid;
+			wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
+		}
+	}
+}
+#else
+static inline void intel_rdt_sched_in(void) {}
+#endif /* CONFIG_INTEL_RDT_A */
+#endif /* _ASM_X86_INTEL_RDT_H */
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
+#ifndef _ASM_X86_INTEL_RDT_COMMON_H
+#define _ASM_X86_INTEL_RDT_COMMON_H
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:		The cached Resource Monitoring ID
+ * @closid:		The cached Class Of Service ID
+ * @rmid_usecnt:	The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+	u32			rmid;
+	u32			closid;
+	int			rmid_usecnt;
+};
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
+obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
 obj-$(CONFIG_MICROCODE)			+= microcode/

--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -153,6 +153,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
+	unsigned int id;
 	unsigned long size;
 	struct amd_northbridge *nb;
 };
@@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
 static void ci_leaf_init(struct cacheinfo *this_leaf,
 			 struct _cpuid4_info_regs *base)
 {
+	this_leaf->id = base->id;
+	this_leaf->attributes = CACHE_ID;
 	this_leaf->level = base->eax.split.level;
 	this_leaf->type = cache_type_map[base->eax.split.type];
 	this_leaf->coherency_line_size =
@@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu)
 	return 0;
 }
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned long num_threads_sharing;
+	int index_msb;
+	num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+	index_msb = get_count_order(num_threads_sharing);
+	id4_regs->id = c->apicid >> index_msb;
+}
 static int __populate_cache_leaves(unsigned int cpu)
 {
 	unsigned int idx, ret;
@@ -931,6 +950,7 @@ static int __populate_cache_leaves(unsigned int cpu)
 		ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
 		if (ret)
 			return ret;
+		get_cache_id(cpu, &id4_regs);
 		ci_leaf_init(this_leaf++, &id4_regs);
 		__cache_cpumap_setup(cpu, idx, &id4_regs);
 	}

--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ *	Please note that all (and only) contiguous '1' combinations
+ *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+	unsigned long first_bit, zero_bit;
+	if (var == 0 || var > r->max_cbm)
+		return false;
+	first_bit = find_first_bit(&var, r->cbm_len);
+	zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+	if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+		return false;
+	if ((zero_bit - first_bit) < r->min_cbm_bits)
+		return false;
+	return true;
+}
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+	unsigned long data;
+	int ret;
+	ret = kstrtoul(buf, 16, &data);
+	if (ret)
+		return ret;
+	if (!cbm_validate(data, r))
+		return -EINVAL;
+	r->tmp_cbms[r->num_tmp_cbms++] = data;
+	return 0;
+}
+/*
+ * For each domain in this resource we expect to find a series of:
+ *	id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+	char *dom = NULL, *id;
+	struct rdt_domain *d;
+	unsigned long dom_id;
+	list_for_each_entry(d, &r->domains, list) {
+		dom = strsep(&line, ";");
+		if (!dom)
+			return -EINVAL;
+		id = strsep(&dom, "=");
+		if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+			return -EINVAL;
+		if (parse_cbm(dom, r))
+			return -EINVAL;
+	}
+	/* Any garbage at the end of the line? */
+	if (line && line[0])
+		return -EINVAL;
+	return 0;
+}
+static int update_domains(struct rdt_resource *r, int closid)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu, idx = 0;
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+	msr_param.low = closid;
+	msr_param.high = msr_param.low + 1;
+	msr_param.res = r;
+	list_for_each_entry(d, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+		d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+	}
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_cbm_update(&msr_param);
+	/* Update CBM on other cpus. */
+	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	put_cpu();
+	free_cpumask_var(cpu_mask);
+	return 0;
+}
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	char *tok, *resname;
+	int closid, ret = 0;
+	u32 *l3_cbms = NULL;
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+	buf[nbytes - 1] = '\0';
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+	closid = rdtgrp->closid;
+	/* get scratch space to save all the masks while we validate input */
+	for_each_enabled_rdt_resource(r) {
+		r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+				      GFP_KERNEL);
+		if (!r->tmp_cbms) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		r->num_tmp_cbms = 0;
+	}
+	while ((tok = strsep(&buf, "\n")) != NULL) {
+		resname = strsep(&tok, ":");
+		if (!tok) {
+			ret = -EINVAL;
+			goto out;
+		}
+		for_each_enabled_rdt_resource(r) {
+			if (!strcmp(resname, r->name) &&
+			    closid < r->num_closid) {
+				ret = parse_line(tok, r);
+				if (ret)
+					goto out;
+				break;
+			}
+		}
+		if (!r->name) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+	/* Did the parser find all the masks we need? */
+	for_each_enabled_rdt_resource(r) {
+		if (r->num_tmp_cbms != r->num_domains) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+	for_each_enabled_rdt_resource(r) {
+		ret = update_domains(r, closid);
+		if (ret)
+			goto out;
+	}
+out:
+	rdtgroup_kn_unlock(of->kn);
+	for_each_enabled_rdt_resource(r) {
+		kfree(r->tmp_cbms);
+		r->tmp_cbms = NULL;
+	}
+	return ret ?: nbytes;
+}
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+	struct rdt_domain *dom;
+	bool sep = false;
+	seq_printf(s, "%s:", r->name);
+	list_for_each_entry(dom, &r->domains, list) {
+		if (sep)
+			seq_puts(s, ";");
+		seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+		sep = true;
+	}
+	seq_puts(s, "\n");
+}
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	int closid, ret = 0;
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp) {
+		closid = rdtgrp->closid;
+		for_each_enabled_rdt_resource(r) {
+			if (closid < r->num_closid)
+				show_doms(s, r, closid);
+		}
+	} else {
+		ret = -ENOENT;
+	}
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -20,12 +20,15 @@ struct cpuid_bit {
 /* Please keep the leaf sorted by cpuid_bit.level for faster search. */
 static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_APERFMPERF,       CPUID_ECX,  0, 0x00000006, 0 },
-	{ X86_FEATURE_EPB,              CPUID_ECX,  3, 0x00000006, 0 },
+	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 },
-	{ X86_FEATURE_INTEL_PT,         CPUID_EBX, 25, 0x00000007, 0 },
+	{ X86_FEATURE_INTEL_PT,		CPUID_EBX, 25, 0x00000007, 0 },
 	{ X86_FEATURE_AVX512_4VNNIW,    CPUID_EDX,  2, 0x00000007, 0 },
 	{ X86_FEATURE_AVX512_4FMAPS,    CPUID_EDX,  3, 0x00000007, 0 },
-	{ X86_FEATURE_HW_PSTATE,        CPUID_EDX,  7, 0x80000007, 0 },
+	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },
-	{ X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
+	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },
+	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 },
+	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
+	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
 	{ 0, 0, 0, 0, 0 }
 };

--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
+#include <asm/intel_rdt.h>
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -296,5 +297,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	this_cpu_write(current_task, next_p);
+	/* Load the Intel cache allocation PQR MSR. */
+	intel_rdt_sched_in();
 	return prev_p;
 }
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,6 +49,7 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/vdso.h>
+#include <asm/intel_rdt.h>
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -476,6 +477,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 			loadsegment(ss, __KERNEL_DS);
 	}
+	/* Load the Intel cache allocation PQR MSR. */
+	intel_rdt_sched_in();
 	return prev_p;
 }

--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -363,6 +363,7 @@ static ssize_t file_name##_show(struct device *dev,		\
 	return sprintf(buf, "%u\n", this_leaf->object);		\
 }
+show_one(id, id);
 show_one(level, level);
 show_one(coherency_line_size, coherency_line_size);
 show_one(number_of_sets, number_of_sets);
@@ -444,6 +445,7 @@ static ssize_t write_policy_show(struct device *dev,
 	return n;
 }
+static DEVICE_ATTR_RO(id);
 static DEVICE_ATTR_RO(level);
 static DEVICE_ATTR_RO(type);
 static DEVICE_ATTR_RO(coherency_line_size);
@@ -457,6 +459,7 @@ static DEVICE_ATTR_RO(shared_cpu_list);
 static DEVICE_ATTR_RO(physical_line_partition);
 static struct attribute *cache_default_attrs[] = {
+	&dev_attr_id.attr,
 	&dev_attr_type.attr,
 	&dev_attr_level.attr,
 	&dev_attr_shared_cpu_map.attr,
@@ -480,6 +483,8 @@ cache_default_attrs_is_visible(struct kobject *kobj,
 	const struct cpumask *mask = &this_leaf->shared_cpu_map;
 	umode_t mode = attr->mode;
+	if ((attr == &dev_attr_id.attr) && (this_leaf->attributes & CACHE_ID))
+		return mode;
 	if ((attr == &dev_attr_type.attr) && this_leaf->type)
 		return mode;
 	if ((attr == &dev_attr_level.attr) && this_leaf->level)

--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -18,6 +18,7 @@ enum cache_type {
 /**
 * struct cacheinfo - represent a cache leaf node
+ * @id: This cache's id. It is unique among caches with the same (type, level).
 * @type: type of the cache - data, inst or unified
 * @level: represents the hierarchy in the multi-level cache
 * @coherency_line_size: size of each cache line usually representing
@@ -44,6 +45,7 @@ enum cache_type {
 * keeping, the remaining members form the core properties of the cache
 */
 struct cacheinfo {
+	unsigned int id;
 	enum cache_type type;
 	unsigned int level;
 	unsigned int coherency_line_size;
@@ -61,6 +63,7 @@ struct cacheinfo {
 #define CACHE_WRITE_ALLOCATE	BIT(3)
 #define CACHE_ALLOCATE_POLICY_MASK	\
 	(CACHE_READ_ALLOCATE | CACHE_WRITE_ALLOCATE)
+#define CACHE_ID		BIT(4)
 	struct device_node *of_node;
 	bool disable_sysfs;

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1821,6 +1821,9 @@ struct task_struct {
 	/* cg_list protected by css_set_lock and tsk->alloc_lock */
 	struct list_head cg_list;
 #endif
+#ifdef CONFIG_INTEL_RDT_A
+	int closid;
+#endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user *robust_list;
 #ifdef CONFIG_COMPAT

--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -57,6 +57,7 @@
 #define CGROUP_SUPER_MAGIC	0x27e0eb
 #define CGROUP2_SUPER_MAGIC	0x63677270
+#define RDTGROUP_SUPER_MAGIC	0x7655821
 #define STACK_END_MAGIC		0x57AC6E9D