Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 cpu updates from Ingo Molnar: "The biggest changes are an extension of the Intel RDT code to extend it with Intel Memory Bandwidth Allocation CPU support: MBA allows bandwidth allocation between cores, while CBM (already upstream) allows CPU cache partitioning. There's also misc smaller fixes and updates" * 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits) x86/intel_rdt: Return error for incorrect resource names in schemata x86/intel_rdt: Trim whitespace while parsing schemata input x86/intel_rdt: Fix padding when resource is enabled via mount x86/intel_rdt: Get rid of anon union x86/cpu: Keep model defines sorted by model number x86/intel_rdt/mba: Add schemata file support for MBA x86/intel_rdt: Make schemata file parsers resource specific x86/intel_rdt/mba: Add info directory files for Memory Bandwidth Allocation x86/intel_rdt: Make information files resource specific x86/intel_rdt/mba: Add primary support for Memory Bandwidth Allocation (MBA) x86/intel_rdt/mba: Memory bandwith allocation feature detect x86/intel_rdt: Add resource specific msr update function x86/intel_rdt: Move CBM specific data into a struct x86/intel_rdt: Cleanup namespace to support multiple resource types Documentation, x86: Intel Memory bandwidth allocation x86/intel_rdt: Organize code properly x86/intel_rdt: Init padding only if a device exists x86/intel_rdt: Add cpus_list rdtgroup file x86/intel_rdt: Cleanup kernel-doc x86/intel_rdt: Update schemata read to show data in tabular format ...

Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 cpu updates from Ingo Molnar: "The biggest changes are an extension of the Intel RDT code to extend it with Intel Memory Bandwidth Allocation CPU support: MBA allows bandwidth allocation between cores, while CBM (already upstream) allows CPU cache partitioning. There's also misc smaller fixes and updates" * 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits) x86/intel_rdt: Return error for incorrect resource names in schemata x86/intel_rdt: Trim whitespace while parsing schemata input x86/intel_rdt: Fix padding when resource is enabled via mount x86/intel_rdt: Get rid of anon union x86/cpu: Keep model defines sorted by model number x86/intel_rdt/mba: Add schemata file support for MBA x86/intel_rdt: Make schemata file parsers resource specific x86/intel_rdt/mba: Add info directory files for Memory Bandwidth Allocation x86/intel_rdt: Make information files resource specific x86/intel_rdt/mba: Add primary support for Memory Bandwidth Allocation (MBA) x86/intel_rdt/mba: Memory bandwith allocation feature detect x86/intel_rdt: Add resource specific msr update function x86/intel_rdt: Move CBM specific data into a struct x86/intel_rdt: Cleanup namespace to support multiple resource types Documentation, x86: Intel Memory bandwidth allocation x86/intel_rdt: Organize code properly x86/intel_rdt: Init padding only if a device exists x86/intel_rdt: Add cpus_list rdtgroup file x86/intel_rdt: Cleanup kernel-doc x86/intel_rdt: Update schemata read to show data in tabular format ...
a52bbaf4 · Linus Torvalds · 16b76293 · 4797b7df · a52bbaf4 · a52bbaf4
Commit a52bbaf4 authored May 01, 2017 by Linus Torvalds
13 changed files
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -4,6 +4,7 @@ Copyright (C) 2016 Intel Corporation

 Fenghua Yu <fenghua.yu@intel.com>
 Tony Luck <tony.luck@intel.com>
+Vikas Shivappa <vikas.shivappa@intel.com>

 This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
 X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
@@ -22,19 +23,34 @@ Info directory

 The 'info' directory contains information about the enabled
 resources. Each resource has its own subdirectory. The subdirectory
-names reflect the resource names. Each subdirectory contains the
-following files:
+names reflect the resource names.
+Cache resource(L3/L2)  subdirectory contains the following files:

-"num_closids":  The number of CLOSIDs which are valid for this
-	        resource. The kernel uses the smallest number of
-		CLOSIDs of all enabled resources as limit.
+"num_closids":  	The number of CLOSIDs which are valid for this
+			resource. The kernel uses the smallest number of
+			CLOSIDs of all enabled resources as limit.

-"cbm_mask":     The bitmask which is valid for this resource. This
-		mask is equivalent to 100%.
+"cbm_mask":     	The bitmask which is valid for this resource.
+			This mask is equivalent to 100%.

-"min_cbm_bits": The minimum number of consecutive bits which must be
-		set when writing a mask.
+"min_cbm_bits": 	The minimum number of consecutive bits which
+			must be set when writing a mask.

+Memory bandwitdh(MB) subdirectory contains the following files:
+
+"min_bandwidth":	The minimum memory bandwidth percentage which
+			user can request.
+
+"bandwidth_gran":	The granularity in which the memory bandwidth
+			percentage is allocated. The allocated
+			b/w percentage is rounded off to the next
+			control step available on the hardware. The
+			available bandwidth control steps are:
+			min_bandwidth + N * bandwidth_gran.
+
+"delay_linear": 	Indicates if the delay scale is linear or
+			non-linear. This field is purely informational
+			only.

 Resource groups
 ---------------
@@ -59,6 +75,9 @@ There are three files associated with each group:
 	given to the default (root) group. You cannot remove CPUs
 	from the default group.

+"cpus_list": One or more CPU ranges of logical CPUs assigned to this
+	     group. Same rules apply like for the "cpus" file.
+
 "schemata": A list of all the resources available to this group.
 	Each resource has its own line and format - see below for
 	details.
@@ -107,6 +126,22 @@ and 0xA are not.  On a system with a 20-bit mask each bit represents 5%
 of the capacity of the cache. You could partition the cache into four
 equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.

+Memory bandwidth(b/w) percentage
+--------------------------------
+For Memory b/w resource, user controls the resource by indicating the
+percentage of total memory b/w.
+
+The minimum bandwidth percentage value for each cpu model is predefined
+and can be looked up through "info/MB/min_bandwidth". The bandwidth
+granularity that is allocated is also dependent on the cpu model and can
+be looked up at "info/MB/bandwidth_gran". The available bandwidth
+control steps are: min_bw + N * bw_gran. Intermediate values are rounded
+to the next control step available on the hardware.
+
+The bandwidth throttling is a core specific mechanism on some of Intel
+SKUs. Using a high bandwidth and a low bandwidth setting on two threads
+sharing a core will result in both threads being throttled to use the
+low bandwidth.

 L3 details (code and data prioritization disabled)
 --------------------------------------------------
@@ -129,16 +164,38 @@ schemata format is always:

 	L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...

+Memory b/w Allocation details
+-----------------------------
+
+Memory b/w domain is L3 cache.
+
+	MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
+
+Reading/writing the schemata file
+---------------------------------
+Reading the schemata file will show the state of all resources
+on all domains. When writing you only need to specify those values
+which you wish to change.  E.g.
+
+# cat schemata
+L3DATA:0=fffff;1=fffff;2=fffff;3=fffff
+L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+# echo "L3DATA:2=3c0;" > schemata
+# cat schemata
+L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
+L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+
 Example 1
 ---------
 On a two socket machine (one L3 cache per socket) with just four bits
-for cache bit masks
+for cache bit masks, minimum b/w of 10% with a memory bandwidth
+granularity of 10%

 # mount -t resctrl resctrl /sys/fs/resctrl
 # cd /sys/fs/resctrl
 # mkdir p0 p1
-# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
-# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+# echo "L3:0=3;1=c\nMB:0=50;1=50" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3\nMB:0=50;1=50" > /sys/fs/resctrl/p1/schemata

 The default resource group is unmodified, so we have access to all parts
 of all caches (its schemata file reads "L3:0=f;1=f").
@@ -147,6 +204,14 @@ Tasks that are under the control of group "p0" may only allocate from the
 "lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
 Tasks in group "p1" use the "lower" 50% of cache on both sockets.

+Similarly, tasks that are under the control of group "p0" may use a
+maximum memory b/w of 50% on socket0 and 50% on socket 1.
+Tasks in group "p1" may also use 50% memory b/w on both sockets.
+Note that unlike cache masks, memory b/w cannot specify whether these
+allocations can overlap or not. The allocations specifies the maximum
+b/w that the group may be able to use and the system admin can configure
+the b/w accordingly.
+
 Example 2
 ---------
 Again two sockets, but this time with a more realistic 20-bit mask.
@@ -160,9 +225,10 @@ of L3 cache on socket 0.
 # cd /sys/fs/resctrl

 First we reset the schemata for the default group so that the "upper"
-50% of the L3 cache on socket 0 cannot be used by ordinary tasks:
+50% of the L3 cache on socket 0 and 50% of memory b/w cannot be used by
+ordinary tasks:

-# echo "L3:0=3ff;1=fffff" > schemata
+# echo "L3:0=3ff;1=fffff\nMB:0=50;1=100" > schemata

 Next we make a resource group for our first real time task and give
 it access to the "top" 25% of the cache on socket 0.
@@ -185,6 +251,20 @@ Ditto for the second real time task (with the remaining 25% of cache):
 # echo 5678 > p1/tasks
 # taskset -cp 2 5678

+For the same 2 socket system with memory b/w resource and CAT L3 the
+schemata would look like(Assume min_bandwidth 10 and bandwidth_gran is
+10):
+
+For our first real time task this would request 20% memory b/w on socket
+0.
+
+# echo -e "L3:0=f8000;1=fffff\nMB:0=20;1=100" > p0/schemata
+
+For our second real time task this would request an other 20% memory b/w
+on socket 0.
+
+# echo -e "L3:0=f8000;1=fffff\nMB:0=20;1=100" > p0/schemata
+
 Example 3
 ---------

@@ -198,18 +278,22 @@ the tasks.
 # cd /sys/fs/resctrl

 First we reset the schemata for the default group so that the "upper"
-50% of the L3 cache on socket 0 cannot be used by ordinary tasks:
+50% of the L3 cache on socket 0, and 50% of memory bandwidth on socket 0
+cannot be used by ordinary tasks:

-# echo "L3:0=3ff" > schemata
+# echo "L3:0=3ff\nMB:0=50" > schemata

-Next we make a resource group for our real time cores and give
-it access to the "top" 50% of the cache on socket 0.
+Next we make a resource group for our real time cores and give it access
+to the "top" 50% of the cache on socket 0 and 50% of memory bandwidth on
+socket 0.

 # mkdir p0
-# echo "L3:0=ffc00;" > p0/schemata
+# echo "L3:0=ffc00\nMB:0=50" > p0/schemata

 Finally we move core 4-7 over to the new group and make sure that the
-kernel and the tasks running there get 50% of the cache.
+kernel and the tasks running there get 50% of the cache. They should
+also get 50% of memory bandwidth assuming that the cores 4-7 are SMT
+siblings and only the real time threads are scheduled on the cores 4-7.

 # echo C0 > p0/cpus


--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -202,6 +202,8 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */

+#define X86_FEATURE_MBA         ( 7*32+18) /* Memory Bandwidth Allocation */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */

--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -12,6 +12,7 @@
 */

 #define INTEL_FAM6_CORE_YONAH		0x0E
+
 #define INTEL_FAM6_CORE2_MEROM		0x0F
 #define INTEL_FAM6_CORE2_MEROM_L	0x16
 #define INTEL_FAM6_CORE2_PENRYN		0x17
@@ -21,6 +22,7 @@
 #define INTEL_FAM6_NEHALEM_G		0x1F /* Auburndale / Havendale */
 #define INTEL_FAM6_NEHALEM_EP		0x1A
 #define INTEL_FAM6_NEHALEM_EX		0x2E
+
 #define INTEL_FAM6_WESTMERE		0x25
 #define INTEL_FAM6_WESTMERE_EP		0x2C
 #define INTEL_FAM6_WESTMERE_EX		0x2F
@@ -36,9 +38,9 @@
 #define INTEL_FAM6_HASWELL_GT3E		0x46

 #define INTEL_FAM6_BROADWELL_CORE	0x3D
-#define INTEL_FAM6_BROADWELL_XEON_D	0x56
 #define INTEL_FAM6_BROADWELL_GT3E	0x47
 #define INTEL_FAM6_BROADWELL_X		0x4F
+#define INTEL_FAM6_BROADWELL_XEON_D	0x56

 #define INTEL_FAM6_SKYLAKE_MOBILE	0x4E
 #define INTEL_FAM6_SKYLAKE_DESKTOP	0x5E
@@ -59,8 +61,8 @@
 #define INTEL_FAM6_ATOM_MERRIFIELD	0x4A /* Tangier */
 #define INTEL_FAM6_ATOM_MOOREFIELD	0x5A /* Anniedale */
 #define INTEL_FAM6_ATOM_GOLDMONT	0x5C
-#define INTEL_FAM6_ATOM_GEMINI_LAKE	0x7A
 #define INTEL_FAM6_ATOM_DENVERTON	0x5F /* Goldmont Microserver */
+#define INTEL_FAM6_ATOM_GEMINI_LAKE	0x7A

 /* Xeon Phi */


--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -12,6 +12,7 @@
 #define IA32_L3_QOS_CFG		0xc81
 #define IA32_L3_CBM_BASE	0xc90
 #define IA32_L2_CBM_BASE	0xd10
+#define IA32_MBA_THRTL_BASE	0xd50

 #define L3_QOS_CDP_ENABLE	0x01ULL

@@ -37,23 +38,30 @@ struct rdtgroup {
 /* rdtgroup.flags */
 #define	RDT_DELETED		1

+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST	1
+
 /* List of all resource groups */
 extern struct list_head rdt_all_groups;

+extern int max_name_width, max_data_width;
+
 int __init rdtgroup_init(void);

 /**
 * struct rftype - describe each file in the resctrl file system
- * @name: file name
- * @mode: access mode
- * @kf_ops: operations
- * @seq_show: show content of the file
- * @write: write to the file
+ * @name:	File name
+ * @mode:	Access mode
+ * @kf_ops:	File operations
+ * @flags:	File specific RFTYPE_FLAGS_* flags
+ * @seq_show:	Show content of the file
+ * @write:	Write to the file
 */
 struct rftype {
 	char			*name;
 	umode_t			mode;
 	struct kernfs_ops	*kf_ops;
+	unsigned long		flags;

 	int (*seq_show)(struct kernfs_open_file *of,
 			struct seq_file *sf, void *v);
@@ -66,55 +74,22 @@ struct rftype {
 			 char *buf, size_t nbytes, loff_t off);
 };

-/**
- * struct rdt_resource - attributes of an RDT resource
- * @enabled:			Is this feature enabled on this machine
- * @capable:			Is this feature available on this machine
- * @name:			Name to use in "schemata" file
- * @num_closid:			Number of CLOSIDs available
- * @max_cbm:			Largest Cache Bit Mask allowed
- * @min_cbm_bits:		Minimum number of consecutive bits to be set
- *				in a cache bit mask
- * @domains:			All domains for this resource
- * @num_domains:		Number of domains active
- * @msr_base:			Base MSR address for CBMs
- * @tmp_cbms:			Scratch space when updating schemata
- * @num_tmp_cbms:		Number of CBMs in tmp_cbms
- * @cache_level:		Which cache level defines scope of this domain
- * @cbm_idx_multi:		Multiplier of CBM index
- * @cbm_idx_offset:		Offset of CBM index. CBM index is computed by:
- *				closid * cbm_idx_multi + cbm_idx_offset
- */
-struct rdt_resource {
-	bool			enabled;
-	bool			capable;
-	char			*name;
-	int			num_closid;
-	int			cbm_len;
-	int			min_cbm_bits;
-	u32			max_cbm;
-	struct list_head	domains;
-	int			num_domains;
-	int			msr_base;
-	u32			*tmp_cbms;
-	int			num_tmp_cbms;
-	int			cache_level;
-	int			cbm_idx_multi;
-	int			cbm_idx_offset;
-};
-
 /**
 * struct rdt_domain - group of cpus sharing an RDT resource
 * @list:	all instances of this resource
 * @id:		unique id for this instance
 * @cpu_mask:	which cpus share this resource
- * @cbm:	array of cache bit masks (indexed by CLOSID)
+ * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
+ * @new_ctrl:	new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
 */
 struct rdt_domain {
 	struct list_head	list;
 	int			id;
 	struct cpumask		cpu_mask;
-	u32			*cbm;
+	u32			*ctrl_val;
+	u32			new_ctrl;
+	bool			have_new_ctrl;
 };

 /**
@@ -129,6 +104,83 @@ struct msr_param {
 	int			high;
 };

+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len:		Length of the cache bit mask
+ * @min_cbm_bits:	Minimum number of consecutive bits to be set
+ * @cbm_idx_mult:	Multiplier of CBM index
+ * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
+ *			closid * cbm_idx_multi + cbm_idx_offset
+ *			in a cache bit mask
+ */
+struct rdt_cache {
+	unsigned int	cbm_len;
+	unsigned int	min_cbm_bits;
+	unsigned int	cbm_idx_mult;
+	unsigned int	cbm_idx_offset;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay:		Max throttle delay. Delay is the hardware
+ *			representation for memory bandwidth.
+ * @min_bw:		Minimum memory bandwidth percentage user can request
+ * @bw_gran:		Granularity at which the memory bandwidth is allocated
+ * @delay_linear:	True if memory B/W delay is in linear scale
+ * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+	u32		max_delay;
+	u32		min_bw;
+	u32		bw_gran;
+	u32		delay_linear;
+	u32		*mb_map;
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @enabled:		Is this feature enabled on this machine
+ * @capable:		Is this feature available on this machine
+ * @name:		Name to use in "schemata" file
+ * @num_closid:		Number of CLOSIDs available
+ * @cache_level:	Which cache level defines scope of this resource
+ * @default_ctrl:	Specifies default cache cbm or memory B/W percent.
+ * @msr_base:		Base MSR address for CBMs
+ * @msr_update:		Function pointer to update QOS MSRs
+ * @data_width:		Character width of data when displaying
+ * @domains:		All domains for this resource
+ * @cache:		Cache allocation related data
+ * @info_files:		resctrl info files for the resource
+ * @nr_info_files:	Number of info files
+ * @format_str:		Per resource format string to show domain value
+ * @parse_ctrlval:	Per resource function pointer to parse control values
+ */
+struct rdt_resource {
+	bool			enabled;
+	bool			capable;
+	char			*name;
+	int			num_closid;
+	int			cache_level;
+	u32			default_ctrl;
+	unsigned int		msr_base;
+	void (*msr_update)	(struct rdt_domain *d, struct msr_param *m,
+				 struct rdt_resource *r);
+	int			data_width;
+	struct list_head	domains;
+	struct rdt_cache	cache;
+	struct rdt_membw	membw;
+	struct rftype		*info_files;
+	int			nr_info_files;
+	const char		*format_str;
+	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
+				 struct rdt_domain *d);
+};
+
+void rdt_get_cache_infofile(struct rdt_resource *r);
+void rdt_get_mba_infofile(struct rdt_resource *r);
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
+
 extern struct mutex rdtgroup_mutex;

 extern struct rdt_resource rdt_resources_all[];
@@ -142,6 +194,7 @@ enum {
 	RDT_RESOURCE_L3DATA,
 	RDT_RESOURCE_L3CODE,
 	RDT_RESOURCE_L2,
+	RDT_RESOURCE_MBA,

 	/* Must be the last */
 	RDT_NUM_RESOURCES,
@@ -149,7 +202,7 @@ enum {

 #define for_each_capable_rdt_resource(r)				      \
 	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
-	     r++) 							      \
+	     r++)							      \
 		if (r->capable)

 #define for_each_enabled_rdt_resource(r)				      \
@@ -165,8 +218,16 @@ union cpuid_0x10_1_eax {
 	unsigned int full;
 };

-/* CPUID.(EAX=10H, ECX=ResID=1).EDX */
-union cpuid_0x10_1_edx {
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+	struct {
+		unsigned int max_delay:12;
+	} split;
+	unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
 	struct {
 		unsigned int cos_max:16;
 	} split;
@@ -175,7 +236,7 @@ union cpuid_0x10_1_edx {

 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);

-void rdt_cbm_update(void *arg);
+void rdt_ctrl_update(void *arg);
 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
 void rdtgroup_kn_unlock(struct kernfs_node *kn);
 ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,

--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -80,7 +80,7 @@ extern u16 __read_mostly tlb_lld_1g[NR_INFO];

 /*
 *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
+ *  Members of this structure are referenced in head_32.S, so think twice
 *  before touching them. [mj]
 */

@@ -89,14 +89,7 @@ struct cpuinfo_x86 {
 	__u8			x86_vendor;	/* CPU vendor */
 	__u8			x86_model;
 	__u8			x86_mask;
-#ifdef CONFIG_X86_32
-	char			wp_works_ok;	/* It doesn't on 386's */
-
-	/* Problems on some 486Dx4's and old 386's: */
-	char			rfu;
-	char			pad0;
-	char			pad1;
-#else
+#ifdef CONFIG_X86_64
 	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
 	int			x86_tlbsize;
 #endif

--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -174,6 +174,13 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
 	.seq_show		= rdtgroup_seqfile_show,
 };

+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+	struct rftype *rft = of->kn->priv;
+
+	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 			      struct seq_file *s, void *v)
 {
@@ -182,10 +189,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,

 	rdtgrp = rdtgroup_kn_lock_live(of->kn);

-	if (rdtgrp)
-		seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
-	else
+	if (rdtgrp) {
+		seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+			   cpumask_pr_args(&rdtgrp->cpu_mask));
+	} else {
 		ret = -ENOENT;
+	}
 	rdtgroup_kn_unlock(of->kn);

 	return ret;
@@ -252,7 +261,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 		goto unlock;
 	}

-	ret = cpumask_parse(buf, newmask);
+	if (is_cpu_list(of))
+		ret = cpulist_parse(buf, newmask);
+	else
+		ret = cpumask_parse(buf, newmask);
+
 	if (ret)
 		goto unlock;

@@ -472,6 +485,14 @@ static struct rftype rdtgroup_base_files[] = {
 		.write		= rdtgroup_cpus_write,
 		.seq_show	= rdtgroup_cpus_show,
 	},
+	{
+		.name		= "cpus_list",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_cpus_write,
+		.seq_show	= rdtgroup_cpus_show,
+		.flags		= RFTYPE_FLAGS_CPUS_LIST,
+	},
 	{
 		.name		= "tasks",
 		.mode		= 0644,
@@ -494,32 +515,56 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
 	struct rdt_resource *r = of->kn->parent->priv;

 	seq_printf(seq, "%d\n", r->num_closid);
+	return 0;
+}
+
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;

+	seq_printf(seq, "%x\n", r->default_ctrl);
 	return 0;
 }

-static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 			     struct seq_file *seq, void *v)
 {
 	struct rdt_resource *r = of->kn->parent->priv;

-	seq_printf(seq, "%x\n", r->max_cbm);
+	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+	return 0;
+}
+
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;

+	seq_printf(seq, "%u\n", r->membw.min_bw);
 	return 0;
 }

-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
 			     struct seq_file *seq, void *v)
 {
 	struct rdt_resource *r = of->kn->parent->priv;

-	seq_printf(seq, "%d\n", r->min_cbm_bits);
+	seq_printf(seq, "%u\n", r->membw.bw_gran);
+	return 0;
+}
+
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;

+	seq_printf(seq, "%u\n", r->membw.delay_linear);
 	return 0;
 }

 /* rdtgroup information files for one cache resource. */
-static struct rftype res_info_files[] = {
+static struct rftype res_cache_info_files[] = {
 	{
 		.name		= "num_closids",
 		.mode		= 0444,
@@ -530,7 +575,7 @@ static struct rftype res_info_files[] = {
 		.name		= "cbm_mask",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
-		.seq_show	= rdt_cbm_mask_show,
+		.seq_show	= rdt_default_ctrl_show,
 	},
 	{
 		.name		= "min_cbm_bits",
@@ -540,11 +585,52 @@ static struct rftype res_info_files[] = {
 	},
 };

+/* rdtgroup information files for memory bandwidth. */
+static struct rftype res_mba_info_files[] = {
+	{
+		.name		= "num_closids",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_num_closids_show,
+	},
+	{
+		.name		= "min_bandwidth",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_min_bw_show,
+	},
+	{
+		.name		= "bandwidth_gran",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_bw_gran_show,
+	},
+	{
+		.name		= "delay_linear",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_delay_linear_show,
+	},
+};
+
+void rdt_get_mba_infofile(struct rdt_resource *r)
+{
+	r->info_files = res_mba_info_files;
+	r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
+}
+
+void rdt_get_cache_infofile(struct rdt_resource *r)
+{
+	r->info_files = res_cache_info_files;
+	r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
+}
+
 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 {
 	struct kernfs_node *kn_subdir;
+	struct rftype *res_info_files;
 	struct rdt_resource *r;
-	int ret;
+	int ret, len;

 	/* create the directory */
 	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
@@ -563,8 +649,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 		ret = rdtgroup_kn_set_ugid(kn_subdir);
 		if (ret)
 			goto out_destroy;
-		ret = rdtgroup_add_files(kn_subdir, res_info_files,
-					 ARRAY_SIZE(res_info_files));
+
+		res_info_files = r->info_files;
+		len = r->nr_info_files;
+
+		ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
 		if (ret)
 			goto out_destroy;
 		kernfs_activate(kn_subdir);
@@ -780,7 +869,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 	return dentry;
 }

-static int reset_all_cbms(struct rdt_resource *r)
+static int reset_all_ctrls(struct rdt_resource *r)
 {
 	struct msr_param msr_param;
 	cpumask_var_t cpu_mask;
@@ -803,14 +892,14 @@ static int reset_all_cbms(struct rdt_resource *r)
 		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);

 		for (i = 0; i < r->num_closid; i++)
-			d->cbm[i] = r->max_cbm;
+			d->ctrl_val[i] = r->default_ctrl;
 	}
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
 	if (cpumask_test_cpu(cpu, cpu_mask))
-		rdt_cbm_update(&msr_param);
+		rdt_ctrl_update(&msr_param);
 	/* Update CBM on all other cpus in cpu_mask. */
-	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
 	put_cpu();

 	free_cpumask_var(cpu_mask);
@@ -896,7 +985,7 @@ static void rdt_kill_sb(struct super_block *sb)

 	/*Put everything back to default values. */
 	for_each_enabled_rdt_resource(r)
-		reset_all_cbms(r);
+		reset_all_ctrls(r);
 	cdp_disable();
 	rmdir_all_sub();
 	static_branch_disable(&rdt_enable_key);

--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -28,27 +28,78 @@
 #include <linux/slab.h>
 #include <asm/intel_rdt.h>

+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+	unsigned long bw;
+	int ret;
+
+	/*
+	 * Only linear delay values is supported for current Intel SKUs.
+	 */
+	if (!r->membw.delay_linear)
+		return false;
+
+	ret = kstrtoul(buf, 10, &bw);
+	if (ret)
+		return false;
+
+	if (bw < r->membw.min_bw || bw > r->default_ctrl)
+		return false;
+
+	*data = roundup(bw, (unsigned long)r->membw.bw_gran);
+	return true;
+}
+
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+	unsigned long data;
+
+	if (d->have_new_ctrl)
+		return -EINVAL;
+
+	if (!bw_validate(buf, &data, r))
+		return -EINVAL;
+	d->new_ctrl = data;
+	d->have_new_ctrl = true;
+
+	return 0;
+}
+
 /*
 * Check whether a cache bit mask is valid. The SDM says:
 *	Please note that all (and only) contiguous '1' combinations
 *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
 * Additionally Haswell requires at least two bits set.
 */
-static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
 {
-	unsigned long first_bit, zero_bit;
+	unsigned long first_bit, zero_bit, val;
+	unsigned int cbm_len = r->cache.cbm_len;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return false;

-	if (var == 0 || var > r->max_cbm)
+	if (val == 0 || val > r->default_ctrl)
 		return false;

-	first_bit = find_first_bit(&var, r->cbm_len);
-	zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+	first_bit = find_first_bit(&val, cbm_len);
+	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);

-	if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+	if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
 		return false;

-	if ((zero_bit - first_bit) < r->min_cbm_bits)
+	if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
 		return false;
+
+	*data = val;
 	return true;
 }

@@ -56,17 +107,17 @@ static bool cbm_validate(unsigned long var, struct rdt_resource *r)
 * Read one cache bit mask (hex). Check that it is valid for the current
 * resource type.
 */
-static int parse_cbm(char *buf, struct rdt_resource *r)
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
 {
 	unsigned long data;
-	int ret;

-	ret = kstrtoul(buf, 16, &data);
-	if (ret)
-		return ret;
-	if (!cbm_validate(data, r))
+	if (d->have_new_ctrl)
 		return -EINVAL;
-	r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+	if(!cbm_validate(buf, &data, r))
+		return -EINVAL;
+	d->new_ctrl = data;
+	d->have_new_ctrl = true;

 	return 0;
 }
@@ -74,8 +125,8 @@ static int parse_cbm(char *buf, struct rdt_resource *r)
 /*
 * For each domain in this resource we expect to find a series of:
 *	id=mask
- * separated by ";". The "id" is in decimal, and must appear in the
- * right order.
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
 */
 static int parse_line(char *line, struct rdt_resource *r)
 {
@@ -83,21 +134,22 @@ static int parse_line(char *line, struct rdt_resource *r)
 	struct rdt_domain *d;
 	unsigned long dom_id;

+next:
+	if (!line || line[0] == '\0')
+		return 0;
+	dom = strsep(&line, ";");
+	id = strsep(&dom, "=");
+	if (!dom || kstrtoul(id, 10, &dom_id))
+		return -EINVAL;
+	dom = strim(dom);
 	list_for_each_entry(d, &r->domains, list) {
-		dom = strsep(&line, ";");
-		if (!dom)
-			return -EINVAL;
-		id = strsep(&dom, "=");
-		if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
-			return -EINVAL;
-		if (parse_cbm(dom, r))
-			return -EINVAL;
+		if (d->id == dom_id) {
+			if (r->parse_ctrlval(dom, r, d))
+				return -EINVAL;
+			goto next;
+		}
 	}
-
-	/* Any garbage at the end of the line? */
-	if (line && line[0])
-		return -EINVAL;
-	return 0;
+	return -EINVAL;
 }

 static int update_domains(struct rdt_resource *r, int closid)
@@ -105,7 +157,7 @@ static int update_domains(struct rdt_resource *r, int closid)
 	struct msr_param msr_param;
 	cpumask_var_t cpu_mask;
 	struct rdt_domain *d;
-	int cpu, idx = 0;
+	int cpu;

 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
 		return -ENOMEM;
@@ -115,30 +167,46 @@ static int update_domains(struct rdt_resource *r, int closid)
 	msr_param.res = r;

 	list_for_each_entry(d, &r->domains, list) {
-		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
-		d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+			d->ctrl_val[closid] = d->new_ctrl;
+		}
 	}
+	if (cpumask_empty(cpu_mask))
+		goto done;
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
 	if (cpumask_test_cpu(cpu, cpu_mask))
-		rdt_cbm_update(&msr_param);
+		rdt_ctrl_update(&msr_param);
 	/* Update CBM on other cpus. */
-	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
 	put_cpu();

+done:
 	free_cpumask_var(cpu_mask);

 	return 0;
 }

+static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+{
+	struct rdt_resource *r;
+
+	for_each_enabled_rdt_resource(r) {
+		if (!strcmp(resname, r->name) && closid < r->num_closid)
+			return parse_line(tok, r);
+	}
+	return -EINVAL;
+}
+
 ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 				char *buf, size_t nbytes, loff_t off)
 {
 	struct rdtgroup *rdtgrp;
+	struct rdt_domain *dom;
 	struct rdt_resource *r;
 	char *tok, *resname;
 	int closid, ret = 0;
-	u32 *l3_cbms = NULL;

 	/* Valid input requires a trailing newline */
 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
@@ -153,44 +221,20 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,

 	closid = rdtgrp->closid;

-	/* get scratch space to save all the masks while we validate input */
 	for_each_enabled_rdt_resource(r) {
-		r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
-				      GFP_KERNEL);
-		if (!r->tmp_cbms) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		r->num_tmp_cbms = 0;
+		list_for_each_entry(dom, &r->domains, list)
+			dom->have_new_ctrl = false;
 	}

 	while ((tok = strsep(&buf, "\n")) != NULL) {
-		resname = strsep(&tok, ":");
+		resname = strim(strsep(&tok, ":"));
 		if (!tok) {
 			ret = -EINVAL;
 			goto out;
 		}
-		for_each_enabled_rdt_resource(r) {
-			if (!strcmp(resname, r->name) &&
-			    closid < r->num_closid) {
-				ret = parse_line(tok, r);
-				if (ret)
-					goto out;
-				break;
-			}
-		}
-		if (!r->name) {
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	/* Did the parser find all the masks we need? */
-	for_each_enabled_rdt_resource(r) {
-		if (r->num_tmp_cbms != r->num_domains) {
-			ret = -EINVAL;
+		ret = rdtgroup_parse_resource(resname, tok, closid);
+		if (ret)
 			goto out;
-		}
 	}

 	for_each_enabled_rdt_resource(r) {
@@ -200,10 +244,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 	}

 out:
-	for_each_enabled_rdt_resource(r) {
-		kfree(r->tmp_cbms);
-		r->tmp_cbms = NULL;
-	}
 	rdtgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
@@ -213,11 +253,12 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
 	struct rdt_domain *dom;
 	bool sep = false;

-	seq_printf(s, "%s:", r->name);
+	seq_printf(s, "%*s:", max_name_width, r->name);
 	list_for_each_entry(dom, &r->domains, list) {
 		if (sep)
 			seq_puts(s, ";");
-		seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+		seq_printf(s, r->format_str, dom->id, max_data_width,
+			   dom->ctrl_val[closid]);
 		sep = true;
 	}
 	seq_puts(s, "\n");

--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 		   "fpu\t\t: %s\n"
 		   "fpu_exception\t: %s\n"
 		   "cpuid level\t: %d\n"
-		   "wp\t\t: %s\n",
+		   "wp\t\t: yes\n",
 		   static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
 		   static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
 		   static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
 		   static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
 		   static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
-		   c->cpuid_level,
-		   c->wp_works_ok ? "yes" : "no");
+		   c->cpuid_level);
 }
 #else
 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)

--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },
 	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },
 	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 },
+	{ X86_FEATURE_MBA,		CPUID_EBX,  3, 0x00000010, 0 },
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },

--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -173,14 +173,11 @@ static struct resource bss_resource = {


 #ifdef CONFIG_X86_32
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data = {
-	.wp_works_ok = -1,
-};
+/* cpu data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
+
 /* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
-	.wp_works_ok = -1,
-};
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
 EXPORT_SYMBOL(boot_cpu_data);

 unsigned int def_to_bigsmp;

--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -716,15 +716,17 @@ void __init paging_init(void)
 */
 static void __init test_wp_bit(void)
 {
+	int wp_works_ok;
+
 	printk(KERN_INFO
  "Checking if this processor honours the WP bit even in supervisor mode...");

 	/* Any page-aligned address will do, the test is non-destructive */
 	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
-	boot_cpu_data.wp_works_ok = do_test_wp_bit();
+	wp_works_ok = do_test_wp_bit();
 	clear_fixmap(FIX_WP_TEST);

-	if (!boot_cpu_data.wp_works_ok) {
+	if (!wp_works_ok) {
 		printk(KERN_CONT "No.\n");
 		panic("Linux doesn't support CPUs with broken WP.");
 	} else {
@@ -811,8 +813,7 @@ void __init mem_init(void)
 	BUG_ON(VMALLOC_START				>= VMALLOC_END);
 	BUG_ON((unsigned long)high_memory		> VMALLOC_START);

-	if (boot_cpu_data.wp_works_ok < 0)
-		test_wp_bit();
+	test_wp_bit();
 }

 #ifdef CONFIG_MEMORY_HOTPLUG

--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1596,7 +1596,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
 	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-	new_cpu_data.wp_works_ok = 1;
 	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
 #endif